data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
daily_data = fread(data_path)
str(daily_data)
## Classes 'data.table' and 'data.frame':   4331 obs. of  13 variables:
##  $ event_date         : IDate, format: "2021-05-31" "2021-05-31" ...
##  $ product_content_id : int  85004 4066298 6676673 7061886 31515569 32737302 32939029 48740784 73318567 85004 ...
##  $ price              : num  87.4 65.8 120.7 294.4 60.3 ...
##  $ sold_count         : int  80 1398 345 24 454 44 125 2 175 85 ...
##  $ visit_count        : int  6002 19102 19578 1485 20114 5017 5301 221 20256 5265 ...
##  $ basket_count       : int  744 5096 1261 81 2871 316 608 15 965 633 ...
##  $ favored_count      : int  1642 1703 1510 122 2102 607 698 22 1960 1104 ...
##  $ category_sold      : int  5048 6547 4944 951 8155 5198 930 1684 5198 4117 ...
##  $ category_visits    : int  236197 108811 306462 95645 637143 1010634 41973 329087 1010634 215260 ...
##  $ category_basket    : int  33681 28558 23418 4704 49389 33728 3911 12614 33728 25181 ...
##  $ category_favored   : int  40472 11913 26597 8886 62460 96699 5791 24534 96699 36225 ...
##  $ category_brand_sold: int  743 4286 786 179 1759 3665 875 12 3665 430 ...
##  $ ty_visits          : int  125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 131821083 ...
##  - attr(*, ".internal.selfref")=<externalptr>
daily_data$event_date <- as.Date(daily_data$event_date, "%d.%m.%Y")

We divided ‘daily_data’ into different data tables in terms of products to examine and understand more easily in the future parts.

product1_data <- daily_data[product_content_id == 48740784]
product2_data <- daily_data[product_content_id == 73318567]
product3_data <- daily_data[product_content_id == 32737302]
product4_data <- daily_data[product_content_id == 31515569]
product5_data <- daily_data[product_content_id == 6676673]
product6_data <- daily_data[product_content_id == 7061886]
product7_data <- daily_data[product_content_id == 85004]
product8_data <- daily_data[product_content_id == 4066298]
product9_data <- daily_data[product_content_id == 32939029]

ANALYSIS of Product1

ggplot(product1_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 1 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product1_data$sold_count, lag.max = 365, main = "The ACF of the Product1")

There are high autocorrelation values for some recent lags.There is no obvious trend but there may be a daily seasonality.

ANALYSIS of Product2

ggplot(product2_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 2 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product2_data$sold_count, lag.max = 365, main = "The ACF of the Product2")

There are high and decreasing autocorrelation values for recent lags.Initially Decreasing autocorrelation values show us a trend.

ANALYSIS of Product3

ggplot(product3_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 3 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product3_data$sold_count, lag.max = 365, main = "The ACF of the Product3")

There are high and decreasing autocorrelation values for recent lags.Initially decreasing autocorrelation values show us a trend.While there is a downward trend between May 2020 and July 2020, there is a upward trend after March 2021.Since it is a product used in summer, sales start and increase before summer, and sales decrease towards the end of summer.

ANALYSIS of Product4

ggplot(product4_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 4 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product4_data$sold_count, lag.max = 365, main = "The ACF of the Product4")

When we look at the acf chart of product4, there is a trend, although it is not as obvious as the trend in product2 and 3.The decrease in the initial lags indicates the trend, but since this decrease is only for a few lags, we understand that the trend in sales values is short-term.Also it may have weekly and monthly seasonality.

ANALYSIS of Product5

ggplot(product5_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 5 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product5_data$sold_count, lag.max = 365, main = "The ACF of the Product5")

There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 5. There are significant autocorrelation around lag 30.There is a weekly seasonality, this is probably due to salary days or discount days that is specific days or periods(first week or last week of month) in month.

ANALYSIS of Product6

ggplot(product6_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 6 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product6_data$sold_count, lag.max = 365, main = "The ACF of the Product6")

There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 6.There are significant autocorrelation around lag 16.

ANALYSIS of Product7

ggplot(product7_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 7 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product7_data$sold_count, lag.max = 365, main = "The ACF of the Product7")

There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 7. There are significant autocorrelation around lag 15.

ANALYSIS of Product8

ggplot(product8_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 8 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product8_data$sold_count, lag.max = 365, main = "The ACF of the Product8")

There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 8.

ANALYSIS of Product9

ggplot(product9_data, aes(x = event_date, y = sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 9 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

acf(product9_data$sold_count, lag.max = 365, main = "The ACF of the Product9")

There are high and decreasing autocorrelation values for many recent lags ,so there is a trend in sales data of product 9.The decrease in the initial lags indicates the trend, but since this decrease is for many lags, we understand that the trend in sales values is long time period.

Decomposing at Daily Level

In general, the sales values of all products include trend or seasonality.Therefore, none of them are stationary.We need to decomposition to make them stationary.Firstly,we did decomposition at daily level.

product1_ts <- ts(product1_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product2_ts <- ts(product2_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product3_ts <- ts(product3_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product4_ts <- ts(product4_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product5_ts <- ts(product5_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product6_ts <- ts(product6_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product7_ts <- ts(product7_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product8_ts <- ts(product8_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product9_ts <- ts(product9_data$sold_count, start = as.Date("2020-05-25"), 
    end = as.Date("2021-05-31"), frequency = 7)
product1_dec_daily <- decompose(x = product1_ts, type = "additive")
product2_dec_daily <- decompose(x = product2_ts, type = "additive")
product3_dec_daily <- decompose(x = product3_ts, type = "additive")
product4_dec_daily <- decompose(x = product4_ts, type = "additive")
product5_dec_daily <- decompose(x = product5_ts, type = "additive")
product6_dec_daily <- decompose(x = product6_ts, type = "additive")
product7_dec_daily <- decompose(x = product7_ts, type = "additive")
product8_dec_daily <- decompose(x = product8_ts, type = "additive")
product9_dec_daily <- decompose(x = product9_ts, type = "additive")

ACF plot of random component of products that are decomposed at daily level:

acf(product1_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product1")

acf(product2_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product2")

acf(product3_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product3")

acf(product4_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product4")

acf(product5_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product5")

acf(product6_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product6")

acf(product7_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product7")

acf(product8_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product8")

acf(product9_dec_daily$random, na.action = na.pass, lag.max = 365, 
    main = "The ACF of the Random component for Product9")

There are still high autocorrelation values but they are generally purified from trend and seasonality.

summary(ur.kpss(product1_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0018 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product2_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0041 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product3_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0044 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product4_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0018 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product5_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0019 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product6_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0018 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product7_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0018 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product8_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0019 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product9_dec_daily$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 9 lags. 
## 
## Value of test-statistic is: 0.0031 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

The unit root tests suggest that the random components are stationary.

Decomposing at Weekly Level

After daily decomposition,we did decomposition at weekly level.While doing the weekly decomposition, we first found the weekly mean of the daily values.

weekly_product1_data = product1_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product2_data = product2_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product3_data = product3_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product4_data = product4_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product5_data = product5_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product6_data = product6_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product7_data = product7_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product8_data = product8_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product9_data = product9_data[, list(weekly_mean_sold_count = mean(sold_count, 
    na.rm = T)), by = list(week(event_date), year(event_date))]

weekly_product9_data
##     week year weekly_mean_sold_count
##  1:   22 2021             144.750000
##  2:   21 2021             181.571429
##  3:   20 2021             317.714286
##  4:   19 2021             407.285714
##  5:   18 2021             288.000000
##  6:   17 2021             210.857143
##  7:   16 2021             190.142857
##  8:   15 2021             191.000000
##  9:   14 2021             277.142857
## 10:   13 2021             195.000000
## 11:   12 2021              81.000000
## 12:   11 2021              99.428571
## 13:   10 2021             196.428571
## 14:    9 2021             129.714286
## 15:    8 2021             135.714286
## 16:    7 2021              73.428571
## 17:    6 2021              79.285714
## 18:    5 2021              83.857143
## 19:    4 2021              26.571429
## 20:    3 2021             119.571429
## 21:    2 2021             151.857143
## 22:    1 2021              44.714286
## 23:   53 2020              14.500000
## 24:   52 2020              75.142857
## 25:   51 2020              73.285714
## 26:   50 2020              46.571429
## 27:   49 2020              99.714286
## 28:   48 2020              49.142857
## 29:   47 2020              11.571429
## 30:   46 2020              69.000000
## 31:   45 2020              85.142857
## 32:   44 2020              11.571429
## 33:   43 2020              13.142857
## 34:   42 2020              17.857143
## 35:   41 2020              73.142857
## 36:   40 2020              57.571429
## 37:   39 2020              55.285714
## 38:   38 2020              39.857143
## 39:   37 2020              66.571429
## 40:   36 2020              59.857143
## 41:   35 2020              78.285714
## 42:   34 2020              48.285714
## 43:   33 2020              36.571429
## 44:   32 2020              11.571429
## 45:   31 2020              11.571429
## 46:   30 2020              17.571429
## 47:   29 2020              15.142857
## 48:   28 2020              20.000000
## 49:   27 2020               4.000000
## 50:   26 2020               3.571429
## 51:   25 2020              14.571429
## 52:   24 2020              54.428571
## 53:   23 2020              31.285714
## 54:   22 2020              56.857143
## 55:   21 2020              87.500000
##     week year weekly_mean_sold_count

Graphs of weekly mean sold counts of products:

ggplot(weekly_product1_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 1 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product2_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 2 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product3_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 3 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product4_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 4 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product5_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 5 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product6_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 6 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product7_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 7 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product8_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 8 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

ggplot(weekly_product9_data, aes(x = week, y = weekly_mean_sold_count)) + 
    geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 9 Graph", 
    x = "Time", y = "Sales quantity") + theme_dark()

The aim is to purify the seasonality that is effect of factors such as price reductions and paydays that occur in the same periods of the months.

weekly_product1_ts <- ts(weekly_product1_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product2_ts <- ts(weekly_product2_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product3_ts <- ts(weekly_product3_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product4_ts <- ts(weekly_product4_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product5_ts <- ts(weekly_product5_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product6_ts <- ts(weekly_product6_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product7_ts <- ts(weekly_product7_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product8_ts <- ts(weekly_product8_data$weekly_mean_sold_count, 
    frequency = 4)
weekly_product9_ts <- ts(weekly_product9_data$weekly_mean_sold_count, 
    frequency = 4)

product1_dec_weekly <- decompose(x = weekly_product1_ts, type = "additive")
product2_dec_weekly <- decompose(x = weekly_product2_ts, type = "additive")
product3_dec_weekly <- decompose(x = weekly_product3_ts, type = "additive")
product4_dec_weekly <- decompose(x = weekly_product4_ts, type = "additive")
product5_dec_weekly <- decompose(x = weekly_product5_ts, type = "additive")
product6_dec_weekly <- decompose(x = weekly_product6_ts, type = "additive")
product7_dec_weekly <- decompose(x = weekly_product7_ts, type = "additive")
product8_dec_weekly <- decompose(x = weekly_product8_ts, type = "additive")
product9_dec_weekly <- decompose(x = weekly_product9_ts, type = "additive")

ACF plots of random components of products that are decomposed at weekly level:

acf(product1_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product1")

acf(product2_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product2")

acf(product3_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product3")

acf(product4_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product4")

acf(product5_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product5")

acf(product6_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product6")

acf(product7_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product7")

acf(product8_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product8")

acf(product9_dec_weekly$random, na.action = na.pass, lag.max = 52, 
    main = "The ACF of the Random component for Product9")

There are still high autocorrelation values but they are generally purified from trend and seasonality.

summary(ur.kpss(product1_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0429 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product2_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0916 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product3_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.086 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product4_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0362 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product5_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0738 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product6_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0366 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product7_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.042 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product8_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0695 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
summary(ur.kpss(product9_dec_weekly$random))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.1822 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

The unit root tests suggest that the random components are stationary.However, the random components in the daily decomposition had lower values in the unit root test than those in the weekly decomposition.Therefore, we can say that the random components of the daily decomposition are more stationary.

We also set up the monthly decomposition model, but since the sales data of the products are 1 year, when we make them monthly_ts, they consist of only one observation and cannot be decomposed.

ARIMA MODELS

we build arima models with random components of daily decomposition.

product1_arima_model = auto.arima(product1_dec_daily$random, 
    max.p = 2, max.q = 2)
product1_arima_model
## Series: product1_dec_daily$random 
## ARIMA(0,0,1)(0,0,1)[7] with non-zero mean 
## 
## Coefficients:
##          ma1    sma1     mean
##       0.0890  0.2355  -0.0005
## s.e.  0.0284  0.0209   0.0710
## 
## sigma^2 estimated as 7.236:  log likelihood=-6241.55
## AIC=12491.09   AICc=12491.11   BIC=12514.53
product2_arima_model = auto.arima(product2_dec_daily$random, 
    max.p = 2, max.q = 2)
product2_arima_model
## Series: product2_dec_daily$random 
## ARIMA(2,0,0)(2,0,1)[7] with zero mean 
## 
## Coefficients:
##          ar1      ar2    sar1    sar2     sma1
##       0.3671  -0.4402  0.2061  0.1235  -0.1754
## s.e.  0.0189   0.0187  0.1113  0.0216   0.1122
## 
## sigma^2 estimated as 123.6:  log likelihood=-9918.79
## AIC=19849.58   AICc=19849.61   BIC=19884.74
product3_arima_model = auto.arima(product3_dec_daily$random, 
    max.p = 2, max.q = 2)
product3_arima_model
## Series: product3_dec_daily$random 
## ARIMA(2,0,0)(2,0,2)[7] with non-zero mean 
## 
## Coefficients:
##          ar1      ar2    sar1     sar2     sma1    sma2     mean
##       0.2245  -0.3637  1.1567  -0.8305  -1.1510  0.7316  -0.0056
## s.e.  0.0195   0.0196  0.0789   0.0421   0.1016  0.0647   0.0765
## 
## sigma^2 estimated as 26.56:  log likelihood=-7925.22
## AIC=15866.43   AICc=15866.49   BIC=15913.31
product4_arima_model = auto.arima(product4_dec_daily$random, 
    max.p = 2, max.q = 2)
product4_arima_model
## Series: product4_dec_daily$random 
## ARIMA(1,0,2)(0,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     ma2    sma1
##       -0.1484  0.5623  0.1678  0.1263
## s.e.   0.0900  0.0866  0.0332  0.0201
## 
## sigma^2 estimated as 315640:  log likelihood=-20086.45
## AIC=40182.9   AICc=40182.92   BIC=40212.2
product5_arima_model = auto.arima(product5_dec_daily$random, 
    max.p = 2, max.q = 2)
product5_arima_model
## Series: product5_dec_daily$random 
## ARIMA(1,0,1)(1,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     sar1    sma1
##       -0.0229  0.2056  -0.1856  0.2837
## s.e.   0.0530  0.0491   0.1669  0.1632
## 
## sigma^2 estimated as 15049:  log likelihood=-16142.25
## AIC=32294.49   AICc=32294.52   BIC=32323.79
product6_arima_model = auto.arima(product6_dec_daily$random, 
    max.p = 2, max.q = 2)
product6_arima_model
## Series: product6_dec_daily$random 
## ARIMA(1,0,2)(1,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     ma2     sar1    sma1
##       -0.2801  0.5579  0.3024  -0.5928  0.6915
## s.e.   0.0598  0.0547  0.0217   0.0719  0.0643
## 
## sigma^2 estimated as 388.7:  log likelihood=-11403.31
## AIC=22818.62   AICc=22818.65   BIC=22853.78
product7_arima_model = auto.arima(product7_dec_daily$random, 
    max.p = 2, max.q = 2)
product7_arima_model
## Series: product7_dec_daily$random 
## ARIMA(1,0,2)(2,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     ma2     sar1     sar2    sma1
##       -0.5086  0.8848  0.3308  -0.4717  -0.0636  0.5021
## s.e.   0.1020  0.0937  0.0239   0.1614   0.0238  0.1614
## 
## sigma^2 estimated as 1046:  log likelihood=-12685.84
## AIC=25385.69   AICc=25385.73   BIC=25426.71
product8_arima_model = auto.arima(product8_dec_daily$random, 
    max.p = 2, max.q = 2)
product8_arima_model
## Series: product8_dec_daily$random 
## ARIMA(0,0,1)(0,0,1)[7] with zero mean 
## 
## Coefficients:
##          ma1    sma1
##       0.2999  0.0483
## s.e.  0.0173  0.0193
## 
## sigma^2 estimated as 67920:  log likelihood=-18096.36
## AIC=36198.71   AICc=36198.72   BIC=36216.29
product9_arima_model = auto.arima(product9_dec_daily$random, 
    max.p = 2, max.q = 2)
product9_arima_model
## Series: product9_dec_daily$random 
## ARIMA(1,0,1)(1,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     sar1    sma1
##       -0.0651  0.4105  -0.0801  0.2063
## s.e.   0.0385  0.0325   0.1519  0.1494
## 
## sigma^2 estimated as 1137:  log likelihood=-12795.32
## AIC=25600.64   AICc=25600.66   BIC=25629.94

Question 3 starts here

In this part, we will be trying to investigate the possible regressors that are likely to increase the quality of our arima models for each product. Some manipulations are made in order to get rid of the missing values in the product datas.

For each product, we will try to find a relationship between residuals and possible regressors by trying them sequentially. We checked the weekly difference values.

##Product1 ARIMAX

product1_data[, `:=`(price_diff, price - shift(price, 1))]

product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]

plot(product1_data[, list(price_diff, res)], col = day(product1_data$event_date))

product1_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]

product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]

plot(product1_data[, list(visit_count_diff, res)], col = day(product1_data$event_date))

product1_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]

product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]

plot(product1_data[, list(visit_count_diff, res)], col = day(product1_data$event_date))

product1_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]

product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]

plot(product1_data[, list(favored_count_diff, res)], col = day(product1_data$event_date))

product1_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    7))]

product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]

plot(product1_data[, list(category_sold_diff, res)], col = day(product1_data$event_date))

product1_data[, `:=`(category_basket_diff, category_basket - 
    shift(category_basket, 7))]

product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]

plot(product1_data[, list(category_basket_diff, res)], col = day(product1_data$event_date))

There are lots of missing values on the Product1 data. Therefore, it’s hard to find a relationship between regressors and residuals.

product1_data[is.na(price_diff) == T, `:=`(price_diff, 0)]
price_diff_mat_p1 <- matrix(product1_data$price_diff)
arimax_product1 = auto.arima(product1_data$sold_count, xreg = price_diff_mat_p1, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,0,0) with zero mean     : 2011.946
##  ARIMA(0,0,0) with non-zero mean : 1993.988
##  ARIMA(0,0,1) with zero mean     : 1947.225
##  ARIMA(0,0,1) with non-zero mean : 1936.938
##  ARIMA(0,0,2) with zero mean     : 1942.397
##  ARIMA(0,0,2) with non-zero mean : 1934.058
##  ARIMA(0,0,3) with zero mean     : 1928.233
##  ARIMA(0,0,3) with non-zero mean : 1922.884
##  ARIMA(0,0,4) with zero mean     : 1927.623
##  ARIMA(0,0,4) with non-zero mean : 1921.127
##  ARIMA(0,0,5) with zero mean     : 1923.841
##  ARIMA(0,0,5) with non-zero mean : 1918.159
##  ARIMA(1,0,0) with zero mean     : 1932.64
##  ARIMA(1,0,0) with non-zero mean : 1926.603
##  ARIMA(1,0,1) with zero mean     : 1931.647
##  ARIMA(1,0,1) with non-zero mean : 1927.301
##  ARIMA(1,0,2) with zero mean     : 1911.999
##  ARIMA(1,0,2) with non-zero mean : 1912.7
##  ARIMA(1,0,3) with zero mean     : 1924.746
##  ARIMA(1,0,3) with non-zero mean : 1918.114
##  ARIMA(1,0,4) with zero mean     : 1926.713
##  ARIMA(1,0,4) with non-zero mean : 1920.203
##  ARIMA(2,0,0) with zero mean     : 1932.882
##  ARIMA(2,0,0) with non-zero mean : 1927.751
##  ARIMA(2,0,1) with zero mean     : 1913.405
##  ARIMA(2,0,1) with non-zero mean : 1914.101
##  ARIMA(2,0,2) with zero mean     : 1908.487
##  ARIMA(2,0,2) with non-zero mean : 1909.145
##  ARIMA(2,0,3) with zero mean     : 1907.64
##  ARIMA(2,0,3) with non-zero mean : 1908.355
##  ARIMA(3,0,0) with zero mean     : 1931.996
##  ARIMA(3,0,0) with non-zero mean : 1927.914
##  ARIMA(3,0,1) with zero mean     : 1922.155
##  ARIMA(3,0,1) with non-zero mean : 1917.875
##  ARIMA(3,0,2) with zero mean     : 1906.838
##  ARIMA(3,0,2) with non-zero mean : 1907.562
##  ARIMA(4,0,0) with zero mean     : 1934.044
##  ARIMA(4,0,0) with non-zero mean : 1929.788
##  ARIMA(4,0,1) with zero mean     : 1923.672
##  ARIMA(4,0,1) with non-zero mean : 1918.782
##  ARIMA(5,0,0) with zero mean     : 1913.727
##  ARIMA(5,0,0) with non-zero mean : 1911.807
## 
## 
## 
##  Best model: Regression with ARIMA(3,0,2) errors

##Product2 ARIMAX

Product2’s price is constant most of the time. Therefore, it doesn’t make sense to add price as regressor. Instead, we will check the category_sold, visit_count and favored_count as potential regressors. We can’t check the basket_count since it’s full with NA values.

Since we decomposed weekly, we are checking the weekly differences of potential regressors.

###Category_sold

product2_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    7))]

product2_data[, `:=`(res, c(rep(NA, 7), as.numeric(product2_arima_model$residuals[1:365])))]

plot(product2_data[, list(category_sold_diff, res)], col = day(product2_data$event_date))

It doesn’t seem that there is a relationship between residuals and weekly category_sold differencing.

Visit_count

product2_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]


product2_data[, `:=`(res, c(rep(NA, 7), as.numeric(product2_arima_model$residuals[1:365])))]

plot(product2_data[, list(visit_count_diff, res)], col = day(product2_data$event_date))

There doesn’t seem a relation again.

Favored_count

product2_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]


product2_data[, `:=`(res, c(rep(NA, 7), as.numeric(product2_arima_model$residuals[1:365])))]

plot(product2_data[, list(favored_count_diff, res)], col = day(product2_data$event_date))

We can’t see a clear relationship again. Visually, it looks like visit_count_diff seems best to go with among these 3 alternatives.

product2_data[is.na(visit_count_diff) == T, `:=`(visit_count_diff, 
    0)]
visit_count_diff_mat_p2 <- matrix(product1_data$visit_count_diff)
arimax_product2 = auto.arima(product2_data$sold_count, xreg = visit_count_diff_mat_p2, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,1,0)                    : 2878.313
##  Regression with ARIMA(0,1,0) errors : 2879.804
##  ARIMA(0,1,1)                    : 2864.49
##  Regression with ARIMA(0,1,1) errors : 2865.415
##  ARIMA(0,1,2)                    : 2851.691
##  Regression with ARIMA(0,1,2) errors : 2852.139
##  ARIMA(0,1,3)                    : 2852.403
##  Regression with ARIMA(0,1,3) errors : 2853.04
##  ARIMA(0,1,4)                    : 2853.9
##  Regression with ARIMA(0,1,4) errors : 2854.473
##  ARIMA(0,1,5)                    : 2817.056
##  Regression with ARIMA(0,1,5) errors : 2817.206
##  ARIMA(1,1,0)                    : 2871.023
##  Regression with ARIMA(1,1,0) errors : 2872.262
##  ARIMA(1,1,1)                    : 2859.361
##  Regression with ARIMA(1,1,1) errors : 2859.766
##  ARIMA(1,1,2)                    : 2852.651
##  Regression with ARIMA(1,1,2) errors : 2853.212
##  ARIMA(1,1,3)                    : 2854.425
##  Regression with ARIMA(1,1,3) errors : 2855.072
##  ARIMA(1,1,4)                    : 2850.092
##  Regression with ARIMA(1,1,4) errors : 2850.348
##  ARIMA(2,1,0)                    : 2847.763
##  Regression with ARIMA(2,1,0) errors : 2848.405
##  ARIMA(2,1,1)                    : 2846.962
##  Regression with ARIMA(2,1,1) errors : 2847.698
##  ARIMA(2,1,2)                    : Inf
##  Regression with ARIMA(2,1,2) errors : Inf
##  ARIMA(2,1,3)                    : 2837.214
##  Regression with ARIMA(2,1,3) errors : 2838.285
##  ARIMA(3,1,0)                    : 2845.51
##  Regression with ARIMA(3,1,0) errors : 2846.365
##  ARIMA(3,1,1)                    : 2847.575
##  Regression with ARIMA(3,1,1) errors : 2848.442
##  ARIMA(3,1,2)                    : 2837.905
##  Regression with ARIMA(3,1,2) errors : 2838.937
##  ARIMA(4,1,0)                    : 2847.569
##  Regression with ARIMA(4,1,0) errors : 2848.44
##  ARIMA(4,1,1)                    : 2844.678
##  Regression with ARIMA(4,1,1) errors : 2844.963
##  ARIMA(5,1,0)                    : 2835.786
##  Regression with ARIMA(5,1,0) errors : 2836.318
## 
## 
## 
##  Best model: Regression with ARIMA(0,1,5) errors

##Product3 ARIMAX

product3_data[, `:=`(price_diff, price - shift(price, 7))]

product3_data[, `:=`(res, c(rep(NA, 7), as.numeric(product3_arima_model$residuals[1:365])))]

plot(product3_data[, list(price_diff, res)], col = day(product3_data$event_date))

product3_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]
plot(product3_data[, list(visit_count_diff, res)], col = day(product3_data$event_date))

product3_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]
plot(product3_data[, list(basket_count_diff, res)], col = day(product3_data$event_date))

product3_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]
plot(product3_data[, list(favored_count_diff, res)], col = day(product3_data$event_date))

We can see a trend in basket_count. Therefore, we will go with it.

product3_data[is.na(basket_count) == T, `:=`(basket_count, 0)]
basket_count_mat_p3 <- matrix(product3_data$basket_count)
arimax_product3 = auto.arima(product3_data$sold_count, xreg = basket_count_mat_p3, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,0,0) with zero mean     : 2303.674
##  ARIMA(0,0,0) with non-zero mean : 2303.081
##  ARIMA(0,0,1) with zero mean     : 2221.798
##  ARIMA(0,0,1) with non-zero mean : 2221.676
##  ARIMA(0,0,2) with zero mean     : 2217.999
##  ARIMA(0,0,2) with non-zero mean : 2217.773
##  ARIMA(0,0,3) with zero mean     : 2216.895
##  ARIMA(0,0,3) with non-zero mean : 2217.031
##  ARIMA(0,0,4) with zero mean     : 2216.274
##  ARIMA(0,0,4) with non-zero mean : 2216.338
##  ARIMA(0,0,5) with zero mean     : 2207.519
##  ARIMA(0,0,5) with non-zero mean : 2208.101
##  ARIMA(1,0,0) with zero mean     : 2211.654
##  ARIMA(1,0,0) with non-zero mean : 2211.651
##  ARIMA(1,0,1) with zero mean     : 2213.669
##  ARIMA(1,0,1) with non-zero mean : 2213.657
##  ARIMA(1,0,2) with zero mean     : Inf
##  ARIMA(1,0,2) with non-zero mean : 2192.597
##  ARIMA(1,0,3) with zero mean     : Inf
##  ARIMA(1,0,3) with non-zero mean : 2194.232
##  ARIMA(1,0,4) with zero mean     : Inf
##  ARIMA(1,0,4) with non-zero mean : 2192.68
##  ARIMA(2,0,0) with zero mean     : 2213.687
##  ARIMA(2,0,0) with non-zero mean : 2213.687
##  ARIMA(2,0,1) with zero mean     : Inf
##  ARIMA(2,0,1) with non-zero mean : 2199.563
##  ARIMA(2,0,2) with zero mean     : Inf
##  ARIMA(2,0,2) with non-zero mean : 2194.411
##  ARIMA(2,0,3) with zero mean     : 2193.684
##  ARIMA(2,0,3) with non-zero mean : 2185.4
##  ARIMA(3,0,0) with zero mean     : 2206.848
##  ARIMA(3,0,0) with non-zero mean : 2207.107
##  ARIMA(3,0,1) with zero mean     : Inf
##  ARIMA(3,0,1) with non-zero mean : 2191.253
##  ARIMA(3,0,2) with zero mean     : 2180.082
##  ARIMA(3,0,2) with non-zero mean : 2180.692
##  ARIMA(4,0,0) with zero mean     : 2202.974
##  ARIMA(4,0,0) with non-zero mean : 2202.874
##  ARIMA(4,0,1) with zero mean     : Inf
##  ARIMA(4,0,1) with non-zero mean : 2192.681
##  ARIMA(5,0,0) with zero mean     : 2201.788
##  ARIMA(5,0,0) with non-zero mean : 2201.751
## 
## 
## 
##  Best model: Regression with ARIMA(3,0,2) errors

##Product4 ARIMAX

product4_data[, `:=`(price_diff, price - shift(price, 7))]

product4_data[, `:=`(res, c(rep(NA, 7), as.numeric(product4_arima_model$residuals[1:365])))]

plot(product4_data[, list(price_diff, res)], col = day(product4_data$event_date))

product4_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]
plot(product4_data[, list(visit_count_diff, res)], col = day(product4_data$event_date))

product4_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]
plot(product4_data[, list(basket_count_diff, res)], col = day(product4_data$event_date))

product4_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]
plot(product4_data[, list(favored_count_diff, res)], col = day(product4_data$event_date))

product4_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    4))]
plot(product4_data[, list(category_sold_diff, res)], col = day(product4_data$event_date))

We see a decreasing residual trend with the increasing category_sold_differencing levels. Notice here the rate is not 7 but 4.

product4_data[is.na(category_sold_diff) == T, `:=`(category_sold_diff, 
    0)]
category_sold_diff_mat_p4 <- matrix(product4_data$category_sold_diff)
arimax_product4 = auto.arima(product4_data$sold_count, xreg = category_sold_diff_mat_p4, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,1,0)                    : 5669.004
##  Regression with ARIMA(0,1,0) errors : 5671.036
##  ARIMA(0,1,1)                    : 5664.402
##  Regression with ARIMA(0,1,1) errors : 5666.445
##  ARIMA(0,1,2)                    : 5665.855
##  Regression with ARIMA(0,1,2) errors : 5667.909
##  ARIMA(0,1,3)                    : 5663.487
##  Regression with ARIMA(0,1,3) errors : 5665.553
##  ARIMA(0,1,4)                    : 5648.298
##  Regression with ARIMA(0,1,4) errors : 5650.366
##  ARIMA(0,1,5)                    : 5615.193
##  Regression with ARIMA(0,1,5) errors : 5617.282
##  ARIMA(1,1,0)                    : 5664.959
##  Regression with ARIMA(1,1,0) errors : 5667.001
##  ARIMA(1,1,1)                    : 5638.573
##  Regression with ARIMA(1,1,1) errors : 5640.628
##  ARIMA(1,1,2)                    : 5640.523
##  Regression with ARIMA(1,1,2) errors : 5642.589
##  ARIMA(1,1,3)                    : 5651.508
##  Regression with ARIMA(1,1,3) errors : 5653.585
##  ARIMA(1,1,4)                    : 5642.272
##  Regression with ARIMA(1,1,4) errors : 5644.355
##  ARIMA(2,1,0)                    : 5666.475
##  Regression with ARIMA(2,1,0) errors : 5668.529
##  ARIMA(2,1,1)                    : 5640.504
##  Regression with ARIMA(2,1,1) errors : 5642.571
##  ARIMA(2,1,2)                    : 5638.239
##  Regression with ARIMA(2,1,2) errors : 5640.569
##  ARIMA(2,1,3)                    : 5637.833
##  Regression with ARIMA(2,1,3) errors : 5639.921
##  ARIMA(3,1,0)                    : 5663.66
##  Regression with ARIMA(3,1,0) errors : 5665.725
##  ARIMA(3,1,1)                    : 5659.539
##  Regression with ARIMA(3,1,1) errors : 5661.614
##  ARIMA(3,1,2)                    : 5632.567
##  Regression with ARIMA(3,1,2) errors : 5634.654
##  ARIMA(4,1,0)                    : 5659.181
##  Regression with ARIMA(4,1,0) errors : 5661.256
##  ARIMA(4,1,1)                    : 5658.582
##  Regression with ARIMA(4,1,1) errors : 5660.668
##  ARIMA(5,1,0)                    : 5655.374
##  Regression with ARIMA(5,1,0) errors : 5657.462
## 
## 
## 
##  Best model: Regression with ARIMA(0,1,5) errors

##Product5 ARIMAX

product5_data[, `:=`(price_diff, price - shift(price, 4))]

product5_data[, `:=`(res, c(rep(NA, 7), as.numeric(product5_arima_model$residuals[1:365])))]

plot(product5_data[, list(price_diff, res)], col = day(product5_data$event_date))

product5_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    4))]
plot(product5_data[, list(basket_count_diff, res)], col = day(product5_data$event_date))

product5_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    4))]
plot(product5_data[, list(favored_count_diff, res)], col = day(product5_data$event_date))

product5_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    4))]
plot(product5_data[, list(category_sold_diff, res)], col = day(product5_data$event_date))

product5_data[, `:=`(category_brand_sold_diff, category_brand_sold - 
    shift(category_brand_sold, 4))]
plot(product5_data[, list(category_brand_sold_diff, res)], col = day(product5_data$event_date))

With 4 levels of differencing, we can see trends in residuals.

arima_p5 <- arima(product5_data$sold_count, order = c(1, 0, 1), 
    seasonal = c(1, 0, 1))
product5_data[, `:=`(residuals, residuals(arima_p5))]
residuals_mat_p5 <- matrix(product5_data$residuals)
arimax_product5 = auto.arima(product5_data$sold_count, xreg = residuals_mat_p5, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,0,0) with zero mean     : 5556.707
##  ARIMA(0,0,0) with non-zero mean : 4817.509
##  ARIMA(0,0,1) with zero mean     : Inf
##  ARIMA(0,0,1) with non-zero mean : Inf
##  ARIMA(0,0,2) with zero mean     : 4696.098
##  ARIMA(0,0,2) with non-zero mean : Inf
##  ARIMA(0,0,3) with zero mean     : Inf
##  ARIMA(0,0,3) with non-zero mean : Inf
##  ARIMA(0,0,4) with zero mean     : Inf
##  ARIMA(0,0,4) with non-zero mean : Inf
##  ARIMA(0,0,5) with zero mean     : Inf
##  ARIMA(0,0,5) with non-zero mean : Inf
##  ARIMA(1,0,0) with zero mean     : 4335.941
##  ARIMA(1,0,0) with non-zero mean : 4315.347
##  ARIMA(1,0,1) with zero mean     : Inf
##  ARIMA(1,0,1) with non-zero mean : Inf
##  ARIMA(1,0,2) with zero mean     : Inf
##  ARIMA(1,0,2) with non-zero mean : Inf
##  ARIMA(1,0,3) with zero mean     : Inf
##  ARIMA(1,0,3) with non-zero mean : Inf
##  ARIMA(1,0,4) with zero mean     : Inf
##  ARIMA(1,0,4) with non-zero mean : Inf
##  ARIMA(2,0,0) with zero mean     : 4253.731
##  ARIMA(2,0,0) with non-zero mean : 4193.825
##  ARIMA(2,0,1) with zero mean     : Inf
##  ARIMA(2,0,1) with non-zero mean : Inf
##  ARIMA(2,0,2) with zero mean     : Inf
##  ARIMA(2,0,2) with non-zero mean : Inf
##  ARIMA(2,0,3) with zero mean     : Inf
##  ARIMA(2,0,3) with non-zero mean : Inf
##  ARIMA(3,0,0) with zero mean     : 4181.257
##  ARIMA(3,0,0) with non-zero mean : 4153.968
##  ARIMA(3,0,1) with zero mean     : Inf
##  ARIMA(3,0,1) with non-zero mean : Inf
##  ARIMA(3,0,2) with zero mean     : Inf
##  ARIMA(3,0,2) with non-zero mean : Inf
##  ARIMA(4,0,0) with zero mean     : 4181.667
##  ARIMA(4,0,0) with non-zero mean : 4147.894
##  ARIMA(4,0,1) with zero mean     : Inf
##  ARIMA(4,0,1) with non-zero mean : Inf
##  ARIMA(5,0,0) with zero mean     : 4157.536
##  ARIMA(5,0,0) with non-zero mean : 4136.147
## 
## 
## 
##  Best model: Regression with ARIMA(5,0,0) errors

##Product6 ARIMAX

product6_data[, `:=`(price_diff, price - shift(price, 7))]
product6_data[, `:=`(res, c(rep(NA, 7), as.numeric(product6_arima_model$residuals[1:365])))]

plot(product6_data[, list(price_diff, res)], col = day(product6_data$event_date))

product6_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]
plot(product6_data[, list(visit_count_diff, res)], col = day(product6_data$event_date))

product6_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]
plot(product6_data[, list(basket_count_diff, res)], col = day(product6_data$event_date))

product6_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]
plot(product6_data[, list(favored_count_diff, res)], col = day(product6_data$event_date))

product6_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    7))]
plot(product6_data[, list(category_sold_diff, res)], col = day(product6_data$event_date))

product6_data[, `:=`(category_brand_sold_diff, category_brand_sold - 
    shift(category_brand_sold, 7))]
plot(product6_data[, list(category_brand_sold_diff, res)], col = day(product6_data$event_date))

product6_data[is.na(basket_count_diff) == T, `:=`(basket_count_diff, 
    0)]
basket_count_diff_mat_p6 <- matrix(product6_data$basket_count_diff)
arimax_product6 = auto.arima(product6_data$sold_count, xreg = basket_count_diff_mat_p6, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,1,0)                    : 3451.13
##  Regression with ARIMA(0,1,0) errors : 3453.162
##  ARIMA(0,1,1)                    : 3419.629
##  Regression with ARIMA(0,1,1) errors : 3421.67
##  ARIMA(0,1,2)                    : 3395.61
##  Regression with ARIMA(0,1,2) errors : 3397.608
##  ARIMA(0,1,3)                    : 3371.232
##  Regression with ARIMA(0,1,3) errors : 3373.215
##  ARIMA(0,1,4)                    : 3357.304
##  Regression with ARIMA(0,1,4) errors : 3359.275
##  ARIMA(0,1,5)                    : 3357.529
##  Regression with ARIMA(0,1,5) errors : 3359.528
##  ARIMA(1,1,0)                    : 3422.638
##  Regression with ARIMA(1,1,0) errors : 3424.68
##  ARIMA(1,1,1)                    : 3379.96
##  Regression with ARIMA(1,1,1) errors : 3381.922
##  ARIMA(1,1,2)                    : 3381.409
##  Regression with ARIMA(1,1,2) errors : 3383.379
##  ARIMA(1,1,3)                    : 3366.678
##  Regression with ARIMA(1,1,3) errors : 3368.654
##  ARIMA(1,1,4)                    : 3356.009
##  Regression with ARIMA(1,1,4) errors : 3358.003
##  ARIMA(2,1,0)                    : 3424.438
##  Regression with ARIMA(2,1,0) errors : 3426.491
##  ARIMA(2,1,1)                    : 3380.865
##  Regression with ARIMA(2,1,1) errors : 3382.833
##  ARIMA(2,1,2)                    : 3381.591
##  Regression with ARIMA(2,1,2) errors : 3383.569
##  ARIMA(2,1,3)                    : 3342.867
##  Regression with ARIMA(2,1,3) errors : 3344.893
##  ARIMA(3,1,0)                    : 3425.571
##  Regression with ARIMA(3,1,0) errors : 3427.635
##  ARIMA(3,1,1)                    : 3370.159
##  Regression with ARIMA(3,1,1) errors : 3372.153
##  ARIMA(3,1,2)                    : 3358.712
##  Regression with ARIMA(3,1,2) errors : 3360.76
##  ARIMA(4,1,0)                    : 3368.522
##  Regression with ARIMA(4,1,0) errors : 3370.593
##  ARIMA(4,1,1)                    : 3338.49
##  Regression with ARIMA(4,1,1) errors : 3340.532
##  ARIMA(5,1,0)                    : 3358.375
##  Regression with ARIMA(5,1,0) errors : 3360.453
## 
## 
## 
##  Best model: Regression with ARIMA(4,1,1) errors

##Product7 ARIMAX

product7_data[, `:=`(price_diff, price - shift(price, 7))]
product7_data[, `:=`(res, c(rep(NA, 7), as.numeric(product7_arima_model$residuals[1:365])))]

plot(product7_data[, list(price_diff, res)], col = day(product7_data$event_date))

product7_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]
plot(product7_data[, list(visit_count_diff, res)], col = day(product7_data$event_date))

product7_data$basket_count
##   [1]  744  633  570  397  515  982  534  569  692  616  546  512  507  607  561
##  [16]  832 1141 1030  885 1010  688  897  862  908  983 1108  834  874  774  429
##  [31]  551 1114  970  866  893  517  413  375  360  313  321  331  380  436  368
##  [46]  441  502  409  423  507  702  584  545  583  885  871  799  716  646  583
##  [61]  438  421  461  366  449  565  307  263  264  340  316  416  330  285  373
##  [76]  364  385  329  241  420  589  796  769  944  548  495  424  402  413  460
##  [91]  428  536  627  552  394  442  468  470  583  766  612  482  482  450  403
## [106]  471  432  502  591  344  290  465  445  531  489  428  654  674 1001  579
## [121]  469  410  377  565  964  293  293  398  305  233  258  328  383  499  715
## [136]  608  585  665  866 1338  653  351  217  260  315  335  320  227  297  311
## [151]  207  124  268  311  283  252  167  182  225  637  404  545  461  400  294
## [166]  263  227  230  209  272  286  291  334  313  349  367  272  283  345  197
## [181]  181  192  188  349  544  817  623  542  519  197  187  170  131  176  154
## [196]  170  125  145  239  453  619  828 1021  882  891  252  307  154  130  107
## [211]  139  114   86  122  141  100  111  122  117  115  103  126  149  146  116
## [226]  180  141  165  199  475  154  130  145  195  311  286  342  589  479  164
## [241]  159  158  214  431  157  164  139  143  132  119  133  157  205  200  124
## [256]  117  184  165  200  159  135  206  206  432  393  739  372   91   89   62
## [271]  129   93  117  127  164  126  109  152   98  112  108  106  165  185  529
## [286]  407  377  264   96   91  130  161  139  102  105   97  112  141  124  139
## [301]  161  164  122  128  132  209   70  125   90   80  143  127   92   99   78
## [316]   89  105  140  115  122   80   90  104   99  137  224  365  300  321  159
## [331]  116  107  153  131  166  155  141  158  199  244  203  207  161  171  176
## [346]  146  228  353  365  434  131  140  221  196  279  199  151  159  215  203
## [361]  159  188  185  152  193  135  167  162  168  228  282  233
product7_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]
plot(product7_data[, list(basket_count_diff, res)], col = day(product7_data$event_date))

product7_data$favored_count
##   [1] 1642 1104  913  868  985  932  583  711 1115 1290  917  950  711  827 1088
##  [16] 1034 2027 1121  696 1471  858  897  828 1119 1169 1774  906  897 1027 1188
##  [31]  535 1661  819  999 1359  689  801  700  470  397  396  465  426  550  548
##  [46]  557  567  544  593  573  927  634  584  696  886 1077 1172  894 2254 1730
##  [61]  519  917 1337  574  958 2856  643  891  462  487  486  646  412  409  522
##  [76]  675  508  438  548  389  777 1115 1063 1062  743  726  825  723  493  521
##  [91]  547  691  868  756  681  607  556  647  665 1712  734  648 1218  577  515
## [106]  543  928  844  642  471  419  560 1064  821  762  696  800  605  990 1003
## [121] 1080  823  696  812 1806  846 1096 2593  595  624  617 1059 1060 1339 1452
## [136] 1048  732  989 2198 2882 1140  713  507  612  635  583  962  526  923  689
## [151]  429  235  415  496  468  598  330  298  277  796  772  579  635  636  450
## [166]  371  311  484  382  472  424  642  767  468  657  872  953  368  383  391
## [181]  362  273  287    0    0    0    0    0    0    0    0    0    0    0    0
## [196]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [211]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [226]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [241]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [256]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [271]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [286]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [301]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [316]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [331]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [346]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [361]    0    0    0    0    0    0    0    0    0    0    0    0
product7_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]
plot(product7_data[, list(favored_count_diff, res)], col = day(product7_data$event_date))

We see some degree of trend in visit_count.

product7_data[is.na(visit_count) == T, `:=`(visit_count, 0)]
visit_count_mat_p7 <- matrix(product7_data$visit_count)
arimax_product7 = auto.arima(product7_data$sold_count, xreg = visit_count_mat_p7, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,0,0) with zero mean     : 4309.262
##  ARIMA(0,0,0) with non-zero mean : 4106.892
##  ARIMA(0,0,1) with zero mean     : 4058.455
##  ARIMA(0,0,1) with non-zero mean : 3920.888
##  ARIMA(0,0,2) with zero mean     : 3934.093
##  ARIMA(0,0,2) with non-zero mean : 3840.657
##  ARIMA(0,0,3) with zero mean     : 3860.807
##  ARIMA(0,0,3) with non-zero mean : 3797.017
##  ARIMA(0,0,4) with zero mean     : 3837.916
##  ARIMA(0,0,4) with non-zero mean : 3790.669
##  ARIMA(0,0,5) with zero mean     : 3836.41
##  ARIMA(0,0,5) with non-zero mean : 3792.748
##  ARIMA(1,0,0) with zero mean     : 3817.559
##  ARIMA(1,0,0) with non-zero mean : 3802.401
##  ARIMA(1,0,1) with zero mean     : 3819.597
##  ARIMA(1,0,1) with non-zero mean : 3803.811
##  ARIMA(1,0,2) with zero mean     : 3813.72
##  ARIMA(1,0,2) with non-zero mean : 3792.356
##  ARIMA(1,0,3) with zero mean     : 3815.729
##  ARIMA(1,0,3) with non-zero mean : 3790.374
##  ARIMA(1,0,4) with zero mean     : Inf
##  ARIMA(1,0,4) with non-zero mean : 3792.409
##  ARIMA(2,0,0) with zero mean     : 3819.596
##  ARIMA(2,0,0) with non-zero mean : 3803.499
##  ARIMA(2,0,1) with zero mean     : 3820.105
##  ARIMA(2,0,1) with non-zero mean : 3805.808
##  ARIMA(2,0,2) with zero mean     : 3815.77
##  ARIMA(2,0,2) with non-zero mean : 3792.118
##  ARIMA(2,0,3) with zero mean     : 3817.549
##  ARIMA(2,0,3) with non-zero mean : 3792.451
##  ARIMA(3,0,0) with zero mean     : 3816.403
##  ARIMA(3,0,0) with non-zero mean : 3793.059
##  ARIMA(3,0,1) with zero mean     : 3818.062
##  ARIMA(3,0,1) with non-zero mean : 3795.013
##  ARIMA(3,0,2) with zero mean     : 3810.633
##  ARIMA(3,0,2) with non-zero mean : 3790.374
##  ARIMA(4,0,0) with zero mean     : 3817.132
##  ARIMA(4,0,0) with non-zero mean : 3794.901
##  ARIMA(4,0,1) with zero mean     : Inf
##  ARIMA(4,0,1) with non-zero mean : 3796.461
##  ARIMA(5,0,0) with zero mean     : 3811.272
##  ARIMA(5,0,0) with non-zero mean : 3794.64
## 
## 
## 
##  Best model: Regression with ARIMA(3,0,2) errors

##Product8 ARIMAX

product8_data[, `:=`(price_diff, price - shift(price, 7))]
product8_data[, `:=`(res, c(rep(NA, 7), as.numeric(product8_arima_model$residuals[1:365])))]

plot(product8_data[, list(price_diff, res)], col = day(product8_data$event_date))

product8_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]
plot(product8_data[, list(visit_count_diff, res)], col = day(product8_data$event_date))

product8_data$basket_count
##   [1] 5096 1716 1782 1507 1646 1771 1740 1804 2464 2034 1769 2620 1919 2599 2579
##  [16]  560  486  431  376 1123  600  977  413  472  497  651  630  642  663 1053
##  [31] 1491 1980 3156 3031 3633 2837  806 1673 4463 2529  823  865 1042 1159 1150
##  [46] 1146 1427 1265 1096 1333 1684 1710 1455 2090 2742 3792 6408 2818  707  646
##  [61]  932 1233  828  966 1164  942 1156 1355 1797 2162  981  981  906  909  859
##  [76]  991  815  809  499  899 1235 2436 3186 4332 1826  726  605  637  576  618
##  [91]  614 1002  855  838  702 1010 1722 1857 1253  574  732  448  551  829  993
## [106] 1209 1326 1471 2042 1847  950  746  652  519  593  697  966 1417 1714  677
## [121]  377  373  336  309  489  318  252  233  121   77  266  567  413  323  313
## [136]  321  393  416  457  490  569  395  351  379  391  472  367  336  461  328
## [151]  428  234  269  217  245  226  244  263  810 1317 1547 2230 2123  787  543
## [166]  566  621  689  584  565  619  420  464  422  388  284  334  318  192  953
## [181] 1271 1410  235  507 1195 2387 2189 2116 1307  229  246  287  233  265  269
## [196]  222  268  585 1016 2066 2148 3157 3504 5473 5509  535  669  698  684  379
## [211] 1147  427  290  299  332  387  391  340  327  333  366  382  410  436  447
## [226]  551  501  576  541  588  638 2999  935 1429 2072 5364 3475 4797 1803  419
## [241]  425  406  538  541  400  412  371  420  412  546  628  993  796  309  307
## [256]  283  385  485  978  662  561  698 1574 2791 1939 4953 1504  343  346  280
## [271]  372  511  614  539  434  404  412  227  270  300  503  406  514  951 3808
## [286] 3223 3126 1313  576  487  586  618  837 1298 1110  543  555  569  733 1085
## [301]  618  503  359  277  196  364  627  509 1089 2332 1357  722  519  944  511
## [316]  530  397  391  176  360  361  253  462  428  523  856 1230  712 1517  598
## [331]  329  414  206  317  297  259  284  234  279  342  309  256  260  243  223
## [346]  341  602  764 1045 2101  597  380  350  314  300  377  311  362  274  312
## [361]  313  431  806  312  344  332  292  296  340  399  330  307
product8_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]
plot(product8_data[, list(basket_count_diff, res)], col = day(product8_data$event_date))

product8_data$favored_count
##   [1] 1703  494  509  405  397  409  366  376  485  375  332  465  309  562  581
##  [16]  181  146  121  191  631  158  229  106  104  124  179  158  208  222  320
##  [31]  324  437  820  830 1579 1050  489  791 1729  493  259  327  320  276  314
##  [46]  317  281  303  340  324  408  401  293  530  760 1244 2286  714  240  233
##  [61]  334  351  257  352  430  257  292  327  454  597  354  307  219  223  261
##  [76]  234  237  244  248  179  332  657  966 1706  750  370  340  372  220  189
##  [91]  281  237  319  337  253  285  585  762  317  308  377  228  176  315  414
## [106]  404  328  355  539  627  328  248  205  176  192  257  399  583 1068  617
## [121]  303  294  219  115  148  126  121   97   61   37  137  188  151  121  147
## [136]  134  150  209  225  336  424  187  174  197  207  261  268  187  205  193
## [151]  149  135  144  126  164  166  153  206  325  602  524  794  811  410  418
## [166]  245  273  290  291  273  304  201  253  252  239  143  214  196  137  117
## [181]  143  162  124    0    0    0    0    0    0    0    0    0    0    0    0
## [196]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [211]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [226]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [241]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [256]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [271]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [286]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [301]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [316]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [331]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [346]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [361]    0    0    0    0    0    0    0    0    0    0    0    0
product8_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]
plot(product8_data[, list(favored_count_diff, res)], col = day(product8_data$event_date))

product8_data$category_sold
##   [1]  6547  3691  3604  1598  1775  1948  1751  1984  2603  2543  2764  4427
##  [13]  2036  3005  2726  1911  1757  1446   990  1519  1815  2442  1415  1351
##  [25]  1697  1708  1654  1408  1533  1790  1984  2691  4215  3899  4406  2539
##  [37]  1233  1434  2773  2081  1356  1497  1767  1903  1786  1939  2363  1762
##  [49]  1633  2108  2270  1919  1777  2577  3587  4058  5182  2557  1302  1134
##  [61]  1351  1656  1248  1380  1735  1377  1847  2252  3313  2949  1753  1352
##  [73]  1043  1060  1161  1503  1185  1296  1524  1150  1814  3585  3934  4667
##  [85]  1800   887   703   866  1048  1181  1194  1465  1856  1588  1426  1767
##  [97]  2841  2664  2257  1554  1412  1231  1338  1737  1935  2106  1748  1740
## [109]  1702  1669  1438  1380  1415  1919  1961  2544  3996  4427  5175  2044
## [121]  1119  1074   999  1060  1223  1291  1244  1294  1179   965  1265  1721
## [133]  1580  1686  1680  1758  2034  2222  2258  2297  1578  1435  1422  1458
## [145]  2043  2378  1239  1124  1033   942   958   697   829   813   885   819
## [157]   878   751  1956  5332  6392  3745  1738   896   978  1230  1504  1835
## [169]  1392  1523  1565   867   939  1059   957   905   976   875   631   624
## [181]   601   627   623   805  2738  7031  6391  8825  3328   617   591   562
## [193]   730   745   727   773   798   914  1070  2681  6481  8392  8003 13114
## [205]  3652   793  1311   957   990  1016  1096  1026   959   844   875   958
## [217]  1357  2040   619   667   729   867  1083   979   913   964   830   877
## [229]  1057   818   983  2948  1158  1334  2731  8856  6453  6885  2276   942
## [241]   850  1006  1115  1147   915   907   801   731   707  1444  1463  2045
## [253]  1453   722   673   738   962  1141  1825  1100   643  1034  2197  4346
## [265]  4280  5700  1472   608   627   636   812  1112   850   902   721   641
## [277]   675   582   643   745   809   668   721  1541  7639  8167  4888  1603
## [289]   934   980   983  1258  3082  3007  2259  1021  1168   819  1169  1445
## [301]   827   810   500   323   234   456   605   702  1203  2185  1954   954
## [313]   894  1205  1047   932   686   705   588   776   600   547   955  1178
## [325]   855  1967  3587  2419  4039  1281   404   475   443   547   487   499
## [337]   451   401   342   419   408   404   435   465   348   413   648   858
## [349]  1331  1659   700   450   402   401   446   458   504   517   379   387
## [361]   438   483   855   503   861   518   494   473   550   562   486   413
product8_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    7))]
plot(product8_data[, list(category_sold_diff, res)], col = day(product8_data$event_date))

Category_sold_diff could be added to the ARIMA model.

product8_data[is.na(category_sold_diff) == T, `:=`(category_sold_diff, 
    0)]
category_sold_mat_p8 <- matrix(product8_data$category_sold_diff)
arimax_product8 = auto.arima(product7_data$sold_count, xreg = category_sold_mat_p8, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,1,0)                    : 3792.455
##  Regression with ARIMA(0,1,0) errors : 3794.486
##  ARIMA(0,1,1)                    : 3789.502
##  Regression with ARIMA(0,1,1) errors : 3791.542
##  ARIMA(0,1,2)                    : 3785.631
##  Regression with ARIMA(0,1,2) errors : 3787.677
##  ARIMA(0,1,3)                    : 3750.039
##  Regression with ARIMA(0,1,3) errors : 3751.343
##  ARIMA(0,1,4)                    : 3737.334
##  Regression with ARIMA(0,1,4) errors : Inf
##  ARIMA(0,1,5)                    : 3737.849
##  Regression with ARIMA(0,1,5) errors : Inf
##  ARIMA(1,1,0)                    : 3790.03
##  Regression with ARIMA(1,1,0) errors : 3792.071
##  ARIMA(1,1,1)                    : 3741.426
##  Regression with ARIMA(1,1,1) errors : Inf
##  ARIMA(1,1,2)                    : 3742.646
##  Regression with ARIMA(1,1,2) errors : Inf
##  ARIMA(1,1,3)                    : 3736.762
##  Regression with ARIMA(1,1,3) errors : Inf
##  ARIMA(1,1,4)                    : 3737.874
##  Regression with ARIMA(1,1,4) errors : Inf
##  ARIMA(2,1,0)                    : 3791.643
##  Regression with ARIMA(2,1,0) errors : 3793.694
##  ARIMA(2,1,1)                    : 3742.302
##  Regression with ARIMA(2,1,1) errors : Inf
##  ARIMA(2,1,2)                    : 3743.93
##  Regression with ARIMA(2,1,2) errors : Inf
##  ARIMA(2,1,3)                    : 3738.106
##  Regression with ARIMA(2,1,3) errors : Inf
##  ARIMA(3,1,0)                    : 3780.262
##  Regression with ARIMA(3,1,0) errors : 3782.323
##  ARIMA(3,1,1)                    : 3737.265
##  Regression with ARIMA(3,1,1) errors : Inf
##  ARIMA(3,1,2)                    : 3739.262
##  Regression with ARIMA(3,1,2) errors : Inf
##  ARIMA(4,1,0)                    : 3768.255
##  Regression with ARIMA(4,1,0) errors : 3770.326
##  ARIMA(4,1,1)                    : 3739.184
##  Regression with ARIMA(4,1,1) errors : Inf
##  ARIMA(5,1,0)                    : 3764.502
##  Regression with ARIMA(5,1,0) errors : 3766.582
## 
## 
## 
##  Best model: Regression with ARIMA(1,1,3) errors

##Product9 ARIMAX

product9_data[, `:=`(price_diff, price - shift(price, 7))]
product9_data[, `:=`(res, c(rep(NA, 7), as.numeric(product9_arima_model$residuals[1:365])))]

plot(product9_data[, list(price_diff, res)], col = day(product9_data$event_date))

product9_data[, `:=`(visit_count_diff, visit_count - shift(visit_count, 
    7))]
plot(product9_data[, list(visit_count_diff, res)], col = day(product9_data$event_date))

product9_data[, `:=`(basket_count_diff, basket_count - shift(basket_count, 
    7))]
plot(product9_data[, list(basket_count_diff, res)], col = day(product9_data$event_date))

product9_data$favored_count
##   [1]  698  609  624  608  869  859  525  524  745 1001  812  801  651  834  964
##  [16] 1200 1160  936 1024 1465 2076 2448 1780 1210 1262  840  738  768 1394  983
##  [31]  885  941  790 1080 1093 1559 1685 1560 1706  593  899  795  567  927  655
##  [46] 1148 1388  546  454  718  870  671  474  564 1086 2119 2696 1920  747  529
##  [61]  929  872  789  748  674  437  352  239  175  548  457  317  235  345  493
##  [76]  443  948  181  159  181  381  973 1427 1528 1511 1106 1645 2085  633  755
##  [91] 1599  873  940  551  417  452  674 1326  676  689  621  519  302  346  281
## [106]  331  432  342  290  220  269  296  728  968  436  307  442  614  720  408
## [121]  319  212  186   78  162  261  237  222  160  187  353  510  557  465  588
## [136]  518  921  996 1155 1513  802  676  688  507  446  396  199  101  228  229
## [151]  168   86   88  121  105  180  216  244  745 1990 1213  541  255  290  233
## [166]  275  261  221  212  344  435  298  277  299  463  692 1395  788  269  209
## [181]  191  145  117    0    0    0    0    0    0    0    0    0    0    0    0
## [196]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [211]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [226]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [241]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [256]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [271]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [286]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [301]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [316]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [331]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [346]    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
## [361]    0    0    0    0    0    0    0    0    0    0    0    0
product9_data[, `:=`(favored_count_diff, favored_count - shift(favored_count, 
    7))]
plot(product9_data[, list(favored_count_diff, res)], col = day(product9_data$event_date))

product9_data$category_sold
##   [1]  930  911  895  807  800  835  698  778  806  926  806  842  802 1030 1180
##  [16] 1304 1142 1048  981 1107 1398 1537 1341 1358 1437 1485 1338 1111 1159  949
##  [31] 1090 1576 2078 2144 2006 1042  836  971 1049  709 1023  928 1017  968  953
##  [46] 1063 1290  943  844  884 1121  939  928 1265 1766 1866 1768 1271  693  613
##  [61]  829  899  741  767  819  685  726  847 1280 1384  816  735  729  872  904
##  [76] 1077 1109  966  863  801 1539 3132 4031 2462 1252  683  598  615  640  970
##  [91]  909  726  705  732  638  719 1186 1218 1081 1080 1128 1091  984 1281 1241
## [106] 1311 1379 1675 1479 1260 1160 1115 1099 1197 1311 2121 3691 3891 2792 1193
## [121]  652  650  610  668  767  930  979  855  856  718  757  772  826  760  916
## [136]  953 1207 1319 1795 1551  939  908  904  893  833  918  778  720  717  659
## [151]  596  551  548  610  600  724  825  867 1713 3414 3413 3043  829  697  707
## [166]  836  917  984  899 1147 1239  834  684  737  706  778  994 1004  659  627
## [181]  656  512  576  845 2247 5557 4145 5139 1436  477  497  408  437  419  488
## [196]  529  638  753  779 1951 4472 4908 3663 4708 1509  329  399  366  448  479
## [211]  452  429  350  344  406  391  418  434  421  454  419  514  559  560  623
## [226]  661  558  605  632  747  544  620  617  625 1301 2527 2103 2554 1073  536
## [241]  406  480  625  736  708  740  703  761  789  882 1142 1233 1528  641  668
## [256]  624  746  783  951  804  607  721 1531 4096 3744 3185 1018  560  469  500
## [271]  548  553  800  770  431  415  481  571  590  652  663  560  629 1218 2986
## [286] 1770 1799 1118  835  625  593  425  472  480  505  455  520  498  582  626
## [301]  733  807  659  548  445  560  648  433  378  598  553  504  400  483  525
## [316]  466  442  361  514  596  539  468  456  452  480  742 1524 1271 1486  587
## [331]  321  360  646  507  444  461  485  500  441  389  459  559  563  520  543
## [346]  597  962 1438 1291 1437  962  718  662  833  907 1021  799  778  712  725
## [361]  823  981 1210  809  809  902  842  810  927 1071 1351 1193
product9_data[, `:=`(category_sold_diff, category_sold - shift(category_sold, 
    7))]
plot(product9_data[, list(category_sold_diff, res)], col = day(product9_data$event_date))

product9_data$category_brand_sold
##   [1]   875   866   857  3370  3946  3769  3019  3144  4260  5373  4010  3587
##  [13]  3444  5287  4543  5925  5366  4884  4265  4992  6454  6717  5763  5658
##  [25]  5647  5043  5341  4254  6758  5354  5412  8649  9786 15283  9991  7894
##  [37]  7579  9130  9528  3763  4897  4380  4702  7122  5424  5227  6516  4603
##  [49]  3285  3862  5379  4182  3513  3898  6101 10600  8913  7840  3770  2818
##  [61]  3241  3537  4709  4997  4572  3138  3631  4388  5156  6120  4295  4467
##  [73]  3935  4887  5470  5329  5887  5608  4245  3123  5821 12868 16442 11074
##  [85]  8138  6196  6732  6619  5442  5878  8359  5416  5346  4345  3986  4346
##  [97]  7195  8083  7374  9184  7628  7868  6117  6621  6542  7542  9416 12180
## [109]  9697  8022  6420  5476  7309  8772  9898  9637 18425 28944 17260  9761
## [121]  8319  6084  5360  3574  4776  7846  7310  7446  4938  4521  4868  4851
## [133]  4091  3755  5106  5005  6628  6611  9282  8788  7464  6527  6083  5639
## [145]  6919  9471  6224  5068  7733  6872  5040  3380  3673  4732  4541  6005
## [157]  5854  5649 10710 21292 23931 17068  9159  6429  5016  5609  5089  5421
## [169]  5345  6095  7110  4513  4651  4271  4232  5039  7355  6527  5288  7799
## [181]  4276  3441  3989     0     0     0     0     0     0     0     0     0
## [193]     0     0     0     0     0     0     0     0     0     0     0     0
## [205]     0     0     0     0     0     0     0     0     0     0     0     0
## [217]     0     0     0     0     0     0     0     0     0     0     0     0
## [229]     0     0     0     0     0     0     0     0     0     0     0     0
## [241]     0     0     0     0     0     0     0     0     0     0     0     0
## [253]     0     0     0     0     0     0     0     0     0     0     0     0
## [265]     0     0     0     0     0     0     0     0     0     0     0     0
## [277]     0     0     0     0     0     0     0     0     0     0     0     0
## [289]     0     0     0     0     0     0     0     0     0     0     0     0
## [301]     0     0     0     0     0     0     0     0     0     0     0     0
## [313]     0     0     0     0     0     0     0     0     0     0     0     0
## [325]     0     0     0     0     0     0     0     0     0     0     0     0
## [337]     0     0     0     0     0     0     0     0     0     0     0     0
## [349]     0     0     0     0     0     0     0     0     0     0     0     0
## [361]     0     0     0     0     0     0     0     0     0     0     0     0
product9_data[, `:=`(category_brand_sold_diff, category_brand_sold - 
    shift(category_brand_sold, 7))]
plot(product9_data[, list(category_brand_sold_diff, res)], col = day(product9_data$event_date))

We can use price_diff as a possible regressor.

product9_data[is.na(price_diff) == T, `:=`(price_diff, 0)]
price_diff_mat_p9 <- matrix(product9_data$price_diff)
arimax_product9 = auto.arima(product9_data$sold_count, xreg = price_diff_mat_p9, 
    seasonal = T, trace = T, stepwise = F, approximation = F)
## 
##  ARIMA(0,1,0)                    : 3860.38
##  Regression with ARIMA(0,1,0) errors : 3862.407
##  ARIMA(0,1,1)                    : 3856.398
##  Regression with ARIMA(0,1,1) errors : 3858.433
##  ARIMA(0,1,2)                    : 3841.621
##  Regression with ARIMA(0,1,2) errors : 3843.648
##  ARIMA(0,1,3)                    : 3837.92
##  Regression with ARIMA(0,1,3) errors : 3839.925
##  ARIMA(0,1,4)                    : 3835.305
##  Regression with ARIMA(0,1,4) errors : 3837.227
##  ARIMA(0,1,5)                    : 3833.391
##  Regression with ARIMA(0,1,5) errors : 3834.468
##  ARIMA(1,1,0)                    : 3858.434
##  Regression with ARIMA(1,1,0) errors : 3860.471
##  ARIMA(1,1,1)                    : 3828.174
##  Regression with ARIMA(1,1,1) errors : 3827.908
##  ARIMA(1,1,2)                    : 3829.703
##  Regression with ARIMA(1,1,2) errors : 3829.588
##  ARIMA(1,1,3)                    : 3831.471
##  Regression with ARIMA(1,1,3) errors : 3831.228
##  ARIMA(1,1,4)                    : 3833.551
##  Regression with ARIMA(1,1,4) errors : 3833.253
##  ARIMA(2,1,0)                    : 3851.833
##  Regression with ARIMA(2,1,0) errors : 3853.879
##  ARIMA(2,1,1)                    : 3829.744
##  Regression with ARIMA(2,1,1) errors : 3829.624
##  ARIMA(2,1,2)                    : 3832.095
##  Regression with ARIMA(2,1,2) errors : 3831.843
##  ARIMA(2,1,3)                    : 3833.378
##  Regression with ARIMA(2,1,3) errors : 3833.317
##  ARIMA(3,1,0)                    : 3850.398
##  Regression with ARIMA(3,1,0) errors : 3852.453
##  ARIMA(3,1,1)                    : 3831.545
##  Regression with ARIMA(3,1,1) errors : 3831.324
##  ARIMA(3,1,2)                    : 3833.47
##  Regression with ARIMA(3,1,2) errors : 3833.396
##  ARIMA(4,1,0)                    : 3843.654
##  Regression with ARIMA(4,1,0) errors : 3845.709
##  ARIMA(4,1,1)                    : 3836.206
##  Regression with ARIMA(4,1,1) errors : 3838.235
##  ARIMA(5,1,0)                    : 3836.589
##  Regression with ARIMA(5,1,0) errors : 3838.636
## 
## 
## 
##  Best model: Regression with ARIMA(1,1,1) errors

#Forecast

To compare arima and arimax models, we compare them for test period which is from 2021-05-11 to 2021-05-29.

train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")
train_date = seq(train_start, test_start, by = "day")
str(train_date)
##  Date[1:352], format: "2020-05-25" "2020-05-26" "2020-05-27" "2020-05-28" "2020-05-29" ...
test_dates = seq(test_start, test_end, by = "day")
test_dates
##  [1] "2021-05-11" "2021-05-12" "2021-05-13" "2021-05-14" "2021-05-15"
##  [6] "2021-05-16" "2021-05-17" "2021-05-18" "2021-05-19" "2021-05-20"
## [11] "2021-05-21" "2021-05-22" "2021-05-23" "2021-05-24" "2021-05-25"
## [16] "2021-05-26" "2021-05-27" "2021-05-28" "2021-05-29"

##Product 1

We define our functions for arima and arimax models.

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(0, 0, 1), seasonal = c(0, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product1

    # forecasted=forecast(fitted,xreg=price_diff_mat_p1[239],
    # h=nahead)
    forecasted = forecast(fitted, xreg = tail(price_diff_mat_p1[!is.na(price_diff_mat_p1)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product1_data[event_date <= current_date]
    forecast_data = product1_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable        value
##  1: 2021-05-11          3  arima_prediction 5.404704e-01
##  2: 2021-05-12          5  arima_prediction 5.479946e-01
##  3: 2021-05-13          1  arima_prediction 5.559558e-01
##  4: 2021-05-14          1  arima_prediction 5.531254e-01
##  5: 2021-05-15          1  arima_prediction 5.546790e-01
##  6: 2021-05-16          1  arima_prediction 5.548164e-01
##  7: 2021-05-17          3  arima_prediction 5.552911e-01
##  8: 2021-05-18          2  arima_prediction 5.616536e-01
##  9: 2021-05-19          1  arima_prediction 5.621574e-01
## 10: 2021-05-20          0  arima_prediction 5.620824e-01
## 11: 2021-05-21          1  arima_prediction 5.598573e-01
## 12: 2021-05-22          0  arima_prediction 5.612916e-01
## 13: 2021-05-23          0  arima_prediction 5.587257e-01
## 14: 2021-05-24          3  arima_prediction 5.574481e-01
## 15: 2021-05-25          2  arima_prediction 5.645349e-01
## 16: 2021-05-26          4  arima_prediction 5.647959e-01
## 17: 2021-05-27          1  arima_prediction 5.728619e-01
## 18: 2021-05-28          1  arima_prediction 5.706718e-01
## 19: 2021-05-29          3  arima_prediction 5.719171e-01
## 20: 2021-05-11          3 arimax_prediction 1.538306e-06
## 21: 2021-05-12          5 arimax_prediction 1.538306e-06
## 22: 2021-05-13          1 arimax_prediction 1.538306e-06
## 23: 2021-05-14          1 arimax_prediction 1.538306e-06
## 24: 2021-05-15          1 arimax_prediction 1.538306e-06
## 25: 2021-05-16          1 arimax_prediction 1.538306e-06
## 26: 2021-05-17          3 arimax_prediction 1.538306e-06
## 27: 2021-05-18          2 arimax_prediction 1.538306e-06
## 28: 2021-05-19          1 arimax_prediction 1.538306e-06
## 29: 2021-05-20          0 arimax_prediction 1.538306e-06
## 30: 2021-05-21          1 arimax_prediction 1.538306e-06
## 31: 2021-05-22          0 arimax_prediction 1.538306e-06
## 32: 2021-05-23          0 arimax_prediction 1.538306e-06
## 33: 2021-05-24          3 arimax_prediction 1.538306e-06
## 34: 2021-05-25          2 arimax_prediction 1.538306e-06
## 35: 2021-05-26          4 arimax_prediction 1.538306e-06
## 36: 2021-05-27          1 arimax_prediction 1.538306e-06
## 37: 2021-05-28          1 arimax_prediction 1.538306e-06
## 38: 2021-05-29          3 arimax_prediction 1.538306e-06
##     event_date sold_count          variable        value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV     FBias MAPE     RMSE
## 1:  arima_prediction 19 1.736842 1.407997 0.8106651 0.6778688  Inf 1.808265
## 2: arimax_prediction 19 1.736842 1.407997 0.8106651 0.9999991  Inf 2.212404
##         MAD      MADP     WMAPE
## 1: 1.354414 0.7798142 0.7798142
## 2: 1.736841 0.9999994 0.9999994

View of data shows that arimax prediction is better in terms of WMAPE.

##Product 2

We define our functions for arima and arimax models.

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(2, 0, 0), seasonal = c(2, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product2

    forecasted = forecast(fitted, xreg = tail(visit_count_diff_mat_p2[!is.na(visit_count_diff_mat_p2)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product2_data[event_date <= current_date]
    forecast_data = product2_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable         value
##  1: 2021-05-11        112  arima_prediction  3.976428e-01
##  2: 2021-05-12        202  arima_prediction  4.212251e-01
##  3: 2021-05-13        142  arima_prediction  3.379453e-01
##  4: 2021-05-14        170  arima_prediction  4.359351e-01
##  5: 2021-05-15        242  arima_prediction  4.414281e-01
##  6: 2021-05-16        173  arima_prediction  1.037243e+00
##  7: 2021-05-17        107  arima_prediction  1.324535e-01
##  8: 2021-05-18        133  arima_prediction  5.940787e-01
##  9: 2021-05-19        109  arima_prediction  6.176498e-01
## 10: 2021-05-20        102  arima_prediction  6.170576e-01
## 11: 2021-05-21        101  arima_prediction  6.027053e-01
## 12: 2021-05-22        186  arima_prediction  1.382863e+00
## 13: 2021-05-23        231  arima_prediction  7.417304e-01
## 14: 2021-05-24        181  arima_prediction  6.425408e-01
## 15: 2021-05-25        177  arima_prediction  6.438554e-01
## 16: 2021-05-26        172  arima_prediction  3.742800e-01
## 17: 2021-05-27        214  arima_prediction  8.557031e-01
## 18: 2021-05-28        194  arima_prediction  4.385175e-01
## 19: 2021-05-29        228  arima_prediction  7.804554e-01
## 20: 2021-05-11        112 arimax_prediction -1.940188e-19
## 21: 2021-05-12        202 arimax_prediction -1.940188e-19
## 22: 2021-05-13        142 arimax_prediction -1.940188e-19
## 23: 2021-05-14        170 arimax_prediction -1.940188e-19
## 24: 2021-05-15        242 arimax_prediction -1.940188e-19
## 25: 2021-05-16        173 arimax_prediction -1.940188e-19
## 26: 2021-05-17        107 arimax_prediction -1.940188e-19
## 27: 2021-05-18        133 arimax_prediction -1.940188e-19
## 28: 2021-05-19        109 arimax_prediction -1.940188e-19
## 29: 2021-05-20        102 arimax_prediction -1.940188e-19
## 30: 2021-05-21        101 arimax_prediction -1.940188e-19
## 31: 2021-05-22        186 arimax_prediction -1.940188e-19
## 32: 2021-05-23        231 arimax_prediction -1.940188e-19
## 33: 2021-05-24        181 arimax_prediction -1.940188e-19
## 34: 2021-05-25        177 arimax_prediction -1.940188e-19
## 35: 2021-05-26        172 arimax_prediction -1.940188e-19
## 36: 2021-05-27        214 arimax_prediction -1.940188e-19
## 37: 2021-05-28        194 arimax_prediction -1.940188e-19
## 38: 2021-05-29        228 arimax_prediction -1.940188e-19
##     event_date sold_count          variable         value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV     FBias      MAPE
## 1:  arima_prediction 19 167.1579 46.35762 0.2773284 0.9963806 0.9962379
## 2: arimax_prediction 19 167.1579 46.35762 0.2773284 1.0000000 1.0000000
##        RMSE      MAD      MADP     WMAPE
## 1: 172.5354 166.5529 0.9963806 0.9963806
## 2: 173.1406 167.1579 1.0000000 1.0000000

View of data shows that arima prediction is better in terms of MAPE.

#Product 3

We define our functions for arima and arimax models.

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(2, 0, 0), seasonal = c(2, 
        0, 2))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product3

    forecasted = forecast(fitted, xreg = tail(basket_count_mat_p3[!is.na(basket_count_mat_p3)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product3_data[event_date <= current_date]
    forecast_data = product3_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable    value
##  1: 2021-05-11         35  arima_prediction 37.60139
##  2: 2021-05-12         53  arima_prediction 38.79232
##  3: 2021-05-13         31  arima_prediction 38.02713
##  4: 2021-05-14         59  arima_prediction 38.80118
##  5: 2021-05-15         66  arima_prediction 38.62039
##  6: 2021-05-16         46  arima_prediction 39.33641
##  7: 2021-05-17         24  arima_prediction 37.92912
##  8: 2021-05-18         37  arima_prediction 37.77184
##  9: 2021-05-19         40  arima_prediction 37.76956
## 10: 2021-05-20         37  arima_prediction 37.77880
## 11: 2021-05-21         34  arima_prediction 37.69812
## 12: 2021-05-22         41  arima_prediction 37.68953
## 13: 2021-05-23         45  arima_prediction 37.69094
## 14: 2021-05-24         31  arima_prediction 39.04140
## 15: 2021-05-25         40  arima_prediction 37.60958
## 16: 2021-05-26         46  arima_prediction 37.66310
## 17: 2021-05-27         41  arima_prediction 37.63778
## 18: 2021-05-28         39  arima_prediction 39.03608
## 19: 2021-05-29         55  arima_prediction 37.59124
## 20: 2021-05-11         35 arimax_prediction 39.21218
## 21: 2021-05-12         53 arimax_prediction 39.21218
## 22: 2021-05-13         31 arimax_prediction 39.21218
## 23: 2021-05-14         59 arimax_prediction 39.21218
## 24: 2021-05-15         66 arimax_prediction 39.21218
## 25: 2021-05-16         46 arimax_prediction 39.21218
## 26: 2021-05-17         24 arimax_prediction 39.21218
## 27: 2021-05-18         37 arimax_prediction 39.21218
## 28: 2021-05-19         40 arimax_prediction 39.21218
## 29: 2021-05-20         37 arimax_prediction 39.21218
## 30: 2021-05-21         34 arimax_prediction 39.21218
## 31: 2021-05-22         41 arimax_prediction 39.21218
## 32: 2021-05-23         45 arimax_prediction 39.21218
## 33: 2021-05-24         31 arimax_prediction 39.21218
## 34: 2021-05-25         40 arimax_prediction 39.21218
## 35: 2021-05-26         46 arimax_prediction 39.21218
## 36: 2021-05-27         41 arimax_prediction 39.21218
## 37: 2021-05-28         39 arimax_prediction 39.21218
## 38: 2021-05-29         55 arimax_prediction 39.21218
##     event_date sold_count          variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV      FBias      MAPE
## 1:  arima_prediction 19 42.10526 10.39174 0.2468039 0.09489262 0.1789984
## 2: arimax_prediction 19 42.10526 10.39174 0.2468039 0.06871078 0.1790068
##        RMSE      MAD      MADP     WMAPE
## 1: 10.74232 7.877993 0.1871023 0.1871023
## 2: 10.52020 7.703340 0.1829543 0.1829543

View of data shows that arima prediction is better in terms of MAPE.

#Product 4

We define our functions for arima and arimax models.

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(1, 0, 2), seasonal = c(0, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product4

    forecasted = forecast(fitted, xreg = tail(category_sold_diff_mat_p4[!is.na(category_sold_diff_mat_p4)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product4_data[event_date <= current_date]
    forecast_data = product4_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable    value
##  1: 2021-05-11        363  arima_prediction 939.5411
##  2: 2021-05-12        522  arima_prediction 939.2181
##  3: 2021-05-13        560  arima_prediction 938.9446
##  4: 2021-05-14        512  arima_prediction 938.4145
##  5: 2021-05-15        550  arima_prediction 938.4417
##  6: 2021-05-16        530  arima_prediction 937.9748
##  7: 2021-05-17        293  arima_prediction 937.5866
##  8: 2021-05-18        301  arima_prediction 937.3904
##  9: 2021-05-19        273  arima_prediction 936.6304
## 10: 2021-05-20        255  arima_prediction 936.0824
## 11: 2021-05-21        229  arima_prediction 935.6306
## 12: 2021-05-22        261  arima_prediction 935.0349
## 13: 2021-05-23        284  arima_prediction 934.4551
## 14: 2021-05-24        240  arima_prediction 934.0283
## 15: 2021-05-25        216  arima_prediction 933.6112
## 16: 2021-05-26        236  arima_prediction 932.9394
## 17: 2021-05-27        281  arima_prediction 932.3776
## 18: 2021-05-28        240  arima_prediction 931.9122
## 19: 2021-05-29        316  arima_prediction 931.4979
## 20: 2021-05-11        363 arimax_prediction 411.5221
## 21: 2021-05-12        522 arimax_prediction 411.5221
## 22: 2021-05-13        560 arimax_prediction 411.5221
## 23: 2021-05-14        512 arimax_prediction 411.5221
## 24: 2021-05-15        550 arimax_prediction 411.5221
## 25: 2021-05-16        530 arimax_prediction 411.5221
## 26: 2021-05-17        293 arimax_prediction 411.5221
## 27: 2021-05-18        301 arimax_prediction 411.5221
## 28: 2021-05-19        273 arimax_prediction 411.5221
## 29: 2021-05-20        255 arimax_prediction 411.5221
## 30: 2021-05-21        229 arimax_prediction 411.5221
## 31: 2021-05-22        261 arimax_prediction 411.5221
## 32: 2021-05-23        284 arimax_prediction 411.5221
## 33: 2021-05-24        240 arimax_prediction 411.5221
## 34: 2021-05-25        216 arimax_prediction 411.5221
## 35: 2021-05-26        236 arimax_prediction 411.5221
## 36: 2021-05-27        281 arimax_prediction 411.5221
## 37: 2021-05-28        240 arimax_prediction 411.5221
## 38: 2021-05-29        316 arimax_prediction 411.5221
##     event_date sold_count          variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV      FBias      MAPE
## 1:  arima_prediction 19 340.1053 124.5498 0.3662096 -1.7517350 2.0559821
## 2: arimax_prediction 19 340.1053 124.5498 0.3662096 -0.2099844 0.4654088
##        RMSE      MAD      MADP     WMAPE
## 1: 607.6258 595.7743 1.7517350 1.7517350
## 2: 140.7002 136.2999 0.4007581 0.4007581

View of data shows that arimax prediction is better in terms of MAPE.

#Product 5

We define our functions for arima and arimax models.

product5_arima_model
## Series: product5_dec_daily$random 
## ARIMA(1,0,1)(1,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     sar1    sma1
##       -0.0229  0.2056  -0.1856  0.2837
## s.e.   0.0530  0.0491   0.1669  0.1632
## 
## sigma^2 estimated as 15049:  log likelihood=-16142.25
## AIC=32294.49   AICc=32294.52   BIC=32323.79
arimax_product5
## Series: product5_data$sold_count 
## Regression with ARIMA(5,0,0) errors 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     ar5  intercept    xreg
##       1.6023  -1.2325  0.7710  -0.4495  0.1912   393.3038  0.6316
## s.e.  0.0510   0.0952  0.1075   0.0948  0.0509    26.6493  0.0081
## 
## sigma^2 estimated as 3821:  log likelihood=-2059.88
## AIC=4135.75   AICc=4136.15   BIC=4167.1
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(1, 0, 1), seasonal = c(1, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product5

    forecasted = forecast(fitted, xreg = tail(residuals_mat_p5[!is.na(residuals_mat_p5)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product5_data[event_date <= current_date]
    forecast_data = product5_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable    value
##  1: 2021-05-11        149  arima_prediction 509.9902
##  2: 2021-05-12        389  arima_prediction 506.8503
##  3: 2021-05-13        511  arima_prediction 508.3720
##  4: 2021-05-14        610  arima_prediction 507.9804
##  5: 2021-05-15        473  arima_prediction 507.9171
##  6: 2021-05-16        496  arima_prediction 507.7354
##  7: 2021-05-17        440  arima_prediction 507.9670
##  8: 2021-05-18        371  arima_prediction 507.8527
##  9: 2021-05-19        303  arima_prediction 507.4218
## 10: 2021-05-20        396  arima_prediction 507.2979
## 11: 2021-05-21        178  arima_prediction 507.4749
## 12: 2021-05-22        535  arima_prediction 507.3865
## 13: 2021-05-23        562  arima_prediction 508.6232
## 14: 2021-05-24        384  arima_prediction 508.3583
## 15: 2021-05-25        280  arima_prediction 507.9776
## 16: 2021-05-26        233  arima_prediction 507.1355
## 17: 2021-05-27        268  arima_prediction 506.7011
## 18: 2021-05-28        258  arima_prediction 506.6841
## 19: 2021-05-29        286  arima_prediction 506.7034
## 20: 2021-05-11        149 arimax_prediction 437.6150
## 21: 2021-05-12        389 arimax_prediction 437.6150
## 22: 2021-05-13        511 arimax_prediction 437.6150
## 23: 2021-05-14        610 arimax_prediction 437.6150
## 24: 2021-05-15        473 arimax_prediction 437.6150
## 25: 2021-05-16        496 arimax_prediction 437.6150
## 26: 2021-05-17        440 arimax_prediction 437.6150
## 27: 2021-05-18        371 arimax_prediction 437.6150
## 28: 2021-05-19        303 arimax_prediction 437.6150
## 29: 2021-05-20        396 arimax_prediction 437.6150
## 30: 2021-05-21        178 arimax_prediction 437.6150
## 31: 2021-05-22        535 arimax_prediction 437.6150
## 32: 2021-05-23        562 arimax_prediction 437.6150
## 33: 2021-05-24        384 arimax_prediction 437.6150
## 34: 2021-05-25        280 arimax_prediction 437.6150
## 35: 2021-05-26        233 arimax_prediction 437.6150
## 36: 2021-05-27        268 arimax_prediction 437.6150
## 37: 2021-05-28        258 arimax_prediction 437.6150
## 38: 2021-05-29        286 arimax_prediction 437.6150
##     event_date sold_count          variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV      FBias      MAPE
## 1:  arima_prediction 19 374.8421 133.5021 0.3561555 -0.3544551 0.6006880
## 2: arimax_prediction 19 374.8421 133.5021 0.3561555 -0.1674650 0.4587804
##        RMSE      MAD      MADP     WMAPE
## 1: 185.7820 152.4055 0.4065859 0.4065859
## 2: 144.3094 122.1092 0.3257618 0.3257618

View of data shows that arima prediction is better in terms of MAPE. #Product 6

We define our functions for arima and arimax models.

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(1, 0, 2), seasonal = c(1, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product6

    forecasted = forecast(fitted, xreg = tail(basket_count_diff_mat_p6[!is.na(basket_count_diff_mat_p6)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product7_data[event_date <= current_date]
    forecast_data = product7_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable    value
##  1: 2021-05-11        126  arima_prediction 49.75387
##  2: 2021-05-12        134  arima_prediction 49.75066
##  3: 2021-05-13        170  arima_prediction 49.82353
##  4: 2021-05-14        178  arima_prediction 49.88569
##  5: 2021-05-15        222  arima_prediction 49.85985
##  6: 2021-05-16         98  arima_prediction 50.05962
##  7: 2021-05-17         72  arima_prediction 50.04828
##  8: 2021-05-18         94  arima_prediction 50.00268
##  9: 2021-05-19         85  arima_prediction 50.03672
## 10: 2021-05-20         77  arima_prediction 50.04390
## 11: 2021-05-21         95  arima_prediction 49.87351
## 12: 2021-05-22        104  arima_prediction 49.92735
## 13: 2021-05-23        114  arima_prediction 49.88775
## 14: 2021-05-24        146  arima_prediction 49.91781
## 15: 2021-05-25         85  arima_prediction 50.08544
## 16: 2021-05-26        125  arima_prediction 50.03131
## 17: 2021-05-27         71  arima_prediction 50.10780
## 18: 2021-05-28         52  arima_prediction 50.11470
## 19: 2021-05-29         67  arima_prediction 50.05460
## 20: 2021-05-11        126 arimax_prediction 40.94258
## 21: 2021-05-12        134 arimax_prediction 40.94258
## 22: 2021-05-13        170 arimax_prediction 40.94258
## 23: 2021-05-14        178 arimax_prediction 40.94258
## 24: 2021-05-15        222 arimax_prediction 40.94258
## 25: 2021-05-16         98 arimax_prediction 40.94258
## 26: 2021-05-17         72 arimax_prediction 40.94258
## 27: 2021-05-18         94 arimax_prediction 40.94258
## 28: 2021-05-19         85 arimax_prediction 40.94258
## 29: 2021-05-20         77 arimax_prediction 40.94258
## 30: 2021-05-21         95 arimax_prediction 40.94258
## 31: 2021-05-22        104 arimax_prediction 40.94258
## 32: 2021-05-23        114 arimax_prediction 40.94258
## 33: 2021-05-24        146 arimax_prediction 40.94258
## 34: 2021-05-25         85 arimax_prediction 40.94258
## 35: 2021-05-26        125 arimax_prediction 40.94258
## 36: 2021-05-27         71 arimax_prediction 40.94258
## 37: 2021-05-28         52 arimax_prediction 40.94258
## 38: 2021-05-29         67 arimax_prediction 40.94258
##     event_date sold_count          variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV     FBias      MAPE
## 1:  arima_prediction 19 111.3158 43.64128 0.3920493 0.5511749 0.4878333
## 2: arimax_prediction 19 111.3158 43.64128 0.3920493 0.6321944 0.5805426
##        RMSE      MAD      MADP     WMAPE
## 1: 74.66883 61.35447 0.5511749 0.5511749
## 2: 82.19921 70.37321 0.6321944 0.6321944

View of data shows that arima prediction is better in terms of MAPE.

#Product 7

We define our functions for arima and arimax models.

product7_arima_model
## Series: product7_dec_daily$random 
## ARIMA(1,0,2)(2,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     ma2     sar1     sar2    sma1
##       -0.5086  0.8848  0.3308  -0.4717  -0.0636  0.5021
## s.e.   0.1020  0.0937  0.0239   0.1614   0.0238  0.1614
## 
## sigma^2 estimated as 1046:  log likelihood=-12685.84
## AIC=25385.69   AICc=25385.73   BIC=25426.71
arimax_product7
## Series: product7_data$sold_count 
## Regression with ARIMA(3,0,2) errors 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1     ma2  intercept    xreg
##       0.9463  -0.7317  0.3664  -0.1640  0.7235    53.9890  0.0140
## s.e.  0.1536   0.2444  0.1359   0.1338  0.1285     8.3913  0.0028
## 
## sigma^2 estimated as 1515:  log likelihood=-1886.99
## AIC=3789.98   AICc=3790.37   BIC=3821.33
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(1, 0, 2), seasonal = c(2, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product7

    forecasted = forecast(fitted, xreg = tail(visit_count_mat_p7[!is.na(visit_count_mat_p7)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product7_data[event_date <= current_date]
    forecast_data = product7_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable    value
##  1: 2021-05-11        126  arima_prediction 50.35676
##  2: 2021-05-12        134  arima_prediction 50.30671
##  3: 2021-05-13        170  arima_prediction 50.37569
##  4: 2021-05-14        178  arima_prediction 50.46844
##  5: 2021-05-15        222  arima_prediction 49.54439
##  6: 2021-05-16         98  arima_prediction 49.68743
##  7: 2021-05-17         72  arima_prediction 49.69112
##  8: 2021-05-18         94  arima_prediction 50.58876
##  9: 2021-05-19         85  arima_prediction 50.57940
## 10: 2021-05-20         77  arima_prediction 49.63221
## 11: 2021-05-21         95  arima_prediction 49.48800
## 12: 2021-05-22        104  arima_prediction 49.57158
## 13: 2021-05-23        114  arima_prediction 49.52115
## 14: 2021-05-24        146  arima_prediction 49.54223
## 15: 2021-05-25         85  arima_prediction 49.67193
## 16: 2021-05-26        125  arima_prediction 49.59822
## 17: 2021-05-27         71  arima_prediction 49.65434
## 18: 2021-05-28         52  arima_prediction 49.68561
## 19: 2021-05-29         67  arima_prediction 49.61904
## 20: 2021-05-11        126 arimax_prediction 45.17516
## 21: 2021-05-12        134 arimax_prediction 45.17516
## 22: 2021-05-13        170 arimax_prediction 45.17516
## 23: 2021-05-14        178 arimax_prediction 45.17516
## 24: 2021-05-15        222 arimax_prediction 45.17516
## 25: 2021-05-16         98 arimax_prediction 45.17516
## 26: 2021-05-17         72 arimax_prediction 45.17516
## 27: 2021-05-18         94 arimax_prediction 45.17516
## 28: 2021-05-19         85 arimax_prediction 45.17516
## 29: 2021-05-20         77 arimax_prediction 45.17516
## 30: 2021-05-21         95 arimax_prediction 45.17516
## 31: 2021-05-22        104 arimax_prediction 45.17516
## 32: 2021-05-23        114 arimax_prediction 45.17516
## 33: 2021-05-24        146 arimax_prediction 45.17516
## 34: 2021-05-25         85 arimax_prediction 45.17516
## 35: 2021-05-26        125 arimax_prediction 45.17516
## 36: 2021-05-27         71 arimax_prediction 45.17516
## 37: 2021-05-28         52 arimax_prediction 45.17516
## 38: 2021-05-29         67 arimax_prediction 45.17516
##     event_date sold_count          variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV     FBias      MAPE    RMSE
## 1:  arima_prediction 19 111.3158 43.64128 0.3920493 0.5519702 0.4894221 74.6514
## 2: arimax_prediction 19 111.3158 43.64128 0.3920493 0.5941711 0.5371797 78.6060
##         MAD      MADP     WMAPE
## 1: 61.44300 0.5519702 0.5519702
## 2: 66.14063 0.5941711 0.5941711

View of data shows that arima prediction is better in terms of MAPE.

#Product 8

We define our functions for arima and arimax models.

# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(0, 0, 1), seasonal = c(0, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product8

    forecasted = forecast(fitted, xreg = tail(category_sold_mat_p8[!is.na(category_sold_mat_p8)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product8_data[event_date <= current_date]
    forecast_data = product8_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable     value
##  1: 2021-05-11        204  arima_prediction 273.36571
##  2: 2021-05-12        263  arima_prediction 272.91825
##  3: 2021-05-13         88  arima_prediction 272.91297
##  4: 2021-05-14        128  arima_prediction 272.01190
##  5: 2021-05-15        144  arima_prediction 271.90366
##  6: 2021-05-16        198  arima_prediction 271.44876
##  7: 2021-05-17        940  arima_prediction 271.28317
##  8: 2021-05-18        854  arima_prediction 273.06861
##  9: 2021-05-19        611  arima_prediction 273.34389
## 10: 2021-05-20        768  arima_prediction 273.65636
## 11: 2021-05-21        581  arima_prediction 274.35769
## 12: 2021-05-22        606  arima_prediction 274.39089
## 13: 2021-05-23        727  arima_prediction 275.08719
## 14: 2021-05-24        632  arima_prediction 275.43295
## 15: 2021-05-25        568  arima_prediction 275.75340
## 16: 2021-05-26        589  arima_prediction 276.03369
## 17: 2021-05-27        588  arima_prediction 276.37822
## 18: 2021-05-28        538  arima_prediction 276.66095
## 19: 2021-05-29        528  arima_prediction 276.82738
## 20: 2021-05-11        204 arimax_prediction  37.42197
## 21: 2021-05-12        263 arimax_prediction  37.42197
## 22: 2021-05-13         88 arimax_prediction  37.42197
## 23: 2021-05-14        128 arimax_prediction  37.42197
## 24: 2021-05-15        144 arimax_prediction  37.42197
## 25: 2021-05-16        198 arimax_prediction  37.42197
## 26: 2021-05-17        940 arimax_prediction  37.42197
## 27: 2021-05-18        854 arimax_prediction  37.42197
## 28: 2021-05-19        611 arimax_prediction  37.42197
## 29: 2021-05-20        768 arimax_prediction  37.42197
## 30: 2021-05-21        581 arimax_prediction  37.42197
## 31: 2021-05-22        606 arimax_prediction  37.42197
## 32: 2021-05-23        727 arimax_prediction  37.42197
## 33: 2021-05-24        632 arimax_prediction  37.42197
## 34: 2021-05-25        568 arimax_prediction  37.42197
## 35: 2021-05-26        589 arimax_prediction  37.42197
## 36: 2021-05-27        588 arimax_prediction  37.42197
## 37: 2021-05-28        538 arimax_prediction  37.42197
## 38: 2021-05-29        528 arimax_prediction  37.42197
##     event_date sold_count          variable     value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV     FBias      MAPE
## 1:  arima_prediction 19 502.8947 256.2574 0.5095648 0.4550669 0.6447023
## 2: arimax_prediction 19 502.8947 256.2574 0.5095648 0.9255869 0.8812548
##        RMSE      MAD      MADP     WMAPE
## 1: 338.0950 293.0151 0.5826569 0.5826569
## 2: 528.0877 465.4728 0.9255869 0.9255869

View of data shows that arima prediction is better in terms of MAPE.

#Product 9

We define our functions for arima and arimax models.

product9_arima_model
## Series: product9_dec_daily$random 
## ARIMA(1,0,1)(1,0,1)[7] with zero mean 
## 
## Coefficients:
##           ar1     ma1     sar1    sma1
##       -0.0651  0.4105  -0.0801  0.2063
## s.e.   0.0385  0.0325   0.1519  0.1494
## 
## sigma^2 estimated as 1137:  log likelihood=-12795.32
## AIC=25600.64   AICc=25600.66   BIC=25629.94
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arima(input_series, order = c(1, 0, 1), seasonal = c(1, 
        0, 1))

    forecasted = forecast(fitted, h = forecast_ahead)
    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count", 
    is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
    command_string = sprintf("input_series=data$%s", target_name)
    print(command_string)
    eval(parse(text = command_string))

    fitted = arimax_product9

    forecasted = forecast(fitted, xreg = tail(price_diff_mat_p9[!is.na(price_diff_mat_p9)], 
        1), h = nahead)

    return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1

results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
    current_date = test_dates[i] - forecast_ahead
    print(test_dates[i])
    past_data = product8_data[event_date <= current_date]
    forecast_data = product8_data[event_date == test_dates[i]]

    arima_forecast = forecast_with_arima(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
    arimax_forecast = forecast_with_arimax(past_data, forecast_ahead, 
        "sold_count", is_trace = F)
    forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]

    results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)

melted_result = melt(overall_results, c("event_date", "sold_count"), 
    c("arima_prediction", "arimax_prediction"))

We turned the results into melted results(long form).

melted_result
##     event_date sold_count          variable    value
##  1: 2021-05-11        204  arima_prediction 220.0599
##  2: 2021-05-12        263  arima_prediction 219.7993
##  3: 2021-05-13         88  arima_prediction 219.8191
##  4: 2021-05-14        128  arima_prediction 219.0358
##  5: 2021-05-15        144  arima_prediction 218.8570
##  6: 2021-05-16        198  arima_prediction 218.6456
##  7: 2021-05-17        940  arima_prediction 218.5281
##  8: 2021-05-18        854  arima_prediction 220.7967
##  9: 2021-05-19        611  arima_prediction 220.8536
## 10: 2021-05-20        768  arima_prediction 220.7399
## 11: 2021-05-21        581  arima_prediction 221.2707
## 12: 2021-05-22        606  arima_prediction 221.1768
## 13: 2021-05-23        727  arima_prediction 221.3741
## 14: 2021-05-24        632  arima_prediction 221.8013
## 15: 2021-05-25        568  arima_prediction 221.8081
## 16: 2021-05-26        589  arima_prediction 221.8768
## 17: 2021-05-27        588  arima_prediction 222.0597
## 18: 2021-05-28        538  arima_prediction 222.2066
## 19: 2021-05-29        528  arima_prediction 222.2686
## 20: 2021-05-11        204 arimax_prediction  63.9130
## 21: 2021-05-12        263 arimax_prediction  63.9130
## 22: 2021-05-13         88 arimax_prediction  63.9130
## 23: 2021-05-14        128 arimax_prediction  63.9130
## 24: 2021-05-15        144 arimax_prediction  63.9130
## 25: 2021-05-16        198 arimax_prediction  63.9130
## 26: 2021-05-17        940 arimax_prediction  63.9130
## 27: 2021-05-18        854 arimax_prediction  63.9130
## 28: 2021-05-19        611 arimax_prediction  63.9130
## 29: 2021-05-20        768 arimax_prediction  63.9130
## 30: 2021-05-21        581 arimax_prediction  63.9130
## 31: 2021-05-22        606 arimax_prediction  63.9130
## 32: 2021-05-23        727 arimax_prediction  63.9130
## 33: 2021-05-24        632 arimax_prediction  63.9130
## 34: 2021-05-25        568 arimax_prediction  63.9130
## 35: 2021-05-26        589 arimax_prediction  63.9130
## 36: 2021-05-27        588 arimax_prediction  63.9130
## 37: 2021-05-28        538 arimax_prediction  63.9130
## 38: 2021-05-29        528 arimax_prediction  63.9130
##     event_date sold_count          variable    value
accu = function(actual, forecast) {
    n = length(actual)
    error = actual - forecast
    mean = mean(actual)
    sd = sd(actual)
    CV = sd/mean
    FBias = sum(error)/sum(actual)
    MAPE = sum(abs(error/actual))/n
    RMSE = sqrt(sum(error^2)/n)
    MAD = sum(abs(error))/n
    MADP = sum(abs(error))/sum(abs(actual))
    WMAPE = MAD/mean
    l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP, 
        WMAPE)
    return(l)
}

To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.

performance = melted_result[, accu(sold_count, value), by = list(variable)]

performance
##             variable  n     mean       sd        CV     FBias      MAPE
## 1:  arima_prediction 19 502.8947 256.2574 0.5095648 0.5611745 0.6082225
## 2: arimax_prediction 19 502.8947 256.2574 0.5095648 0.8729098 0.7971949
##        RMSE      MAD      MADP     WMAPE
## 1: 376.2800 317.4135 0.6311729 0.6311729
## 2: 504.8927 438.9817 0.8729098 0.8729098

View of data shows that arima prediction is better in terms of MAPE.